Files
impala/testdata/workloads/functional-planner/queries/PlannerTest/outer-joins.test
Tianyi Wang 5e9b4e2fd2 IMPALA-5976: Remove equivalence class computation in FE
Equivalence classes are used to determine the equivalences between slots.
The concept is ill-defined and the current implementation is inefficient.
This patch removes equivalence classes and directly uses the information
from the value transfer graph instead.
Value transfer graph is reimplemented using Tarjan's strongly connected
component algorithm and BFS with adjacency lists to speed up on both
condensed and sparse graphs.

Testing: The patch passes the existing tests. In planner tests, the
equivalence between the SCC-condensed graph and the uncondensed graph is
checked. A test case is added for the helper class IntArrayList. An
outer-join edge case is added to the planner tests. On a query with 1800
union operations, the equivalence class computation time is reduced from
7m57s to 65ms and the planning time is reduced from 8m5s to 13s.

Change-Id: If4cb1d8be46efa8fd61a97048cc79dabe2ffa51a
Reviewed-on: http://gerrit.cloudera.org:8080/8317
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Impala Public Jenkins
2017-11-18 09:07:06 +00:00

1157 lines
36 KiB
Plaintext

# correct placement of predicates with left outer joins; t2 and t3 are nullable
select *
from functional.testtbl t1 left outer join functional.testtbl t2 on (
t1.id - 1 = t2.id + 1
# lhs predicate is join predicate
and t1.zip = 94611
# rhs predicate applied by rhs scan
and t2.zip = 94104)
left outer join functional.testtbl t3 on (
t1.id = t3.id
# predicate on t2 is join predicate, not applied by t2 scan
and t2.id = 15
# predicate on t1 and t2 is join predicate
and t1.id - t2.id = 0
# rhs predicate applied by rhs scan
and t3.zip = 94720
)
where
# t1 predicate in where clause is applied by scans (t1; also propagated to t3)
t1.id > 0
# t2 predicate in where clause is applied by the t2 join only, not by the t2 scan
# (t2 is nullable, so IS NULL must be evaluated after the outer join)
and t2.id is null
# t3 predicate in where clause is applied by join and scan
and t3.id is not null
# join predicate between t1 and t2 applied after t2 join
and t1.zip + t2.zip = 10
# join predicate between t1, t2 and t3 applied after last join
and t1.zip + t2.zip + t3.zip= 20
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: t1.id = t3.id
| other join predicates: t2.id = 15, t1.id - t2.id = 0
| other predicates: t3.id IS NOT NULL, t1.zip + t2.zip + t3.zip = 20
|
|--02:SCAN HDFS [functional.testtbl t3]
| partitions=1/1 files=0 size=0B
| predicates: t3.id IS NOT NULL, t3.id > 0, t3.zip = 94720
|
03:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: t1.id - 1 = t2.id + 1
| other join predicates: t1.zip = 94611
| other predicates: t2.id IS NULL, t1.zip + t2.zip = 10
|
|--01:SCAN HDFS [functional.testtbl t2]
| partitions=1/1 files=0 size=0B
| predicates: t2.zip = 94104
|
00:SCAN HDFS [functional.testtbl t1]
partitions=1/1 files=0 size=0B
predicates: t1.id > 0
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|
04:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash predicates: t1.id = t3.id
| other join predicates: t2.id = 15, t1.id - t2.id = 0
| other predicates: t3.id IS NOT NULL, t1.zip + t2.zip + t3.zip = 20
|
|--06:EXCHANGE [BROADCAST]
| |
| 02:SCAN HDFS [functional.testtbl t3]
| partitions=1/1 files=0 size=0B
| predicates: t3.id IS NOT NULL, t3.id > 0, t3.zip = 94720
|
03:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash predicates: t1.id - 1 = t2.id + 1
| other join predicates: t1.zip = 94611
| other predicates: t2.id IS NULL, t1.zip + t2.zip = 10
|
|--05:EXCHANGE [BROADCAST]
| |
| 01:SCAN HDFS [functional.testtbl t2]
| partitions=1/1 files=0 size=0B
| predicates: t2.zip = 94104
|
00:SCAN HDFS [functional.testtbl t1]
partitions=1/1 files=0 size=0B
predicates: t1.id > 0
====
# the same thing with subqueries; should produce the same result
select *
from (select * from functional.testtbl a1) t1
left outer join (select * from functional.testtbl a2) t2 on (
t1.id - 1 = t2.id + 1 and t1.zip = 94611 and t2.zip = 94104)
left outer join (select * from functional.testtbl a3) t3 on (
t1.id = t3.id and t2.id = 15 and t1.id - t2.id = 0 and t3.zip = 94720)
where t1.id > 0 and t2.id is null and t3.id is not null
and t1.zip + t2.zip = 10 and t1.zip + t2.zip + t3.zip= 20
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: a1.id = a3.id
| other join predicates: a2.id = 15, a1.id - a2.id = 0
| other predicates: a3.id IS NOT NULL, a1.zip + a2.zip + a3.zip = 20
|
|--02:SCAN HDFS [functional.testtbl a3]
| partitions=1/1 files=0 size=0B
| predicates: a3.id IS NOT NULL, a3.id > 0, a3.zip = 94720
|
03:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: a1.id - 1 = a2.id + 1
| other join predicates: a1.zip = 94611
| other predicates: a2.id IS NULL, a1.zip + a2.zip = 10
|
|--01:SCAN HDFS [functional.testtbl a2]
| partitions=1/1 files=0 size=0B
| predicates: a2.zip = 94104
|
00:SCAN HDFS [functional.testtbl a1]
partitions=1/1 files=0 size=0B
predicates: a1.id > 0
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|
04:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash predicates: a1.id = a3.id
| other join predicates: a2.id = 15, a1.id - a2.id = 0
| other predicates: a3.id IS NOT NULL, a1.zip + a2.zip + a3.zip = 20
|
|--06:EXCHANGE [BROADCAST]
| |
| 02:SCAN HDFS [functional.testtbl a3]
| partitions=1/1 files=0 size=0B
| predicates: a3.id IS NOT NULL, a3.id > 0, a3.zip = 94720
|
03:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash predicates: a1.id - 1 = a2.id + 1
| other join predicates: a1.zip = 94611
| other predicates: a2.id IS NULL, a1.zip + a2.zip = 10
|
|--05:EXCHANGE [BROADCAST]
| |
| 01:SCAN HDFS [functional.testtbl a2]
| partitions=1/1 files=0 size=0B
| predicates: a2.zip = 94104
|
00:SCAN HDFS [functional.testtbl a1]
partitions=1/1 files=0 size=0B
predicates: a1.id > 0
====
# correct propagation of scan predicates in OJ On clauses:
# id = 17 must not be applied by the t1 and t3 scans
select *
from functional.testtbl t1
left outer join functional.testtbl t2 on (
t1.id = t2.id and t1.id = 17)
join functional.testtbl t3 on (t1.id = t3.id)
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [INNER JOIN]
| hash predicates: t1.id = t3.id
| runtime filters: RF000 <- t3.id
|
|--02:SCAN HDFS [functional.testtbl t3]
| partitions=1/1 files=0 size=0B
|
03:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: t1.id = t2.id
| other join predicates: t1.id = 17
|
|--01:SCAN HDFS [functional.testtbl t2]
| partitions=1/1 files=0 size=0B
| predicates: t2.id = 17
| runtime filters: RF000 -> t2.id
|
00:SCAN HDFS [functional.testtbl t1]
partitions=1/1 files=0 size=0B
runtime filters: RF000 -> t1.id
====
# correct placement of predicates with right outer joins; t1 and t2 are nullable
select *
from functional.testtbl t1 right outer join functional.testtbl t2 on (
t1.id - 1 = t2.id + 1
# lhs predicate could be a scan predicate (t1 is nullable), but the expected plan
# keeps it as a join predicate
and t1.zip = 94611
# rhs predicate is join predicate
and t2.zip = 94104)
right outer join functional.testtbl t3 on (
t1.id = t3.id
# predicate on t2 is scan predicate
and t2.id = 15
# predicate on t1 and t2 could be applied by previous join
# but the planner doesn't recognize that case and makes it a join predicate
and t1.id - t2.id = 0
# rhs predicate is join predicate
and t3.zip = 94720
)
where
# t1 predicate in where clause is applied by scan and last join
t1.id > 0
# t2 predicate in where clause is applied by last join
and t2.id is null
# t3 predicate in where clause is applied by scan
and t3.id is not null
# join predicate between t1 and t2 applied after last join
and t1.zip + t2.zip = 10
# join predicate between t1, t2 and t3 applied after last join
and t1.zip + t2.zip + t3.zip= 20
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: t1.id = t3.id
| other join predicates: t3.zip = 94720, t1.id - t2.id = 0
| other predicates: t2.id IS NULL, t1.id > 0, t1.zip + t2.zip = 10, t1.zip + t2.zip + t3.zip = 20
| runtime filters: RF000 <- t3.id
|
|--02:SCAN HDFS [functional.testtbl t3]
| partitions=1/1 files=0 size=0B
| predicates: t3.id IS NOT NULL
|
03:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: t1.id - 1 = t2.id + 1
| other join predicates: t1.zip = 94611, t2.zip = 94104
| runtime filters: RF002 <- t2.id + 1
|
|--01:SCAN HDFS [functional.testtbl t2]
| partitions=1/1 files=0 size=0B
| predicates: t2.id = 15
|
00:SCAN HDFS [functional.testtbl t1]
partitions=1/1 files=0 size=0B
predicates: t1.id IS NOT NULL, t1.id > 0
runtime filters: RF000 -> t1.id, RF002 -> t1.id - 1
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
09:EXCHANGE [UNPARTITIONED]
|
04:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: t1.id = t3.id
| other join predicates: t3.zip = 94720, t1.id - t2.id = 0
| other predicates: t2.id IS NULL, t1.id > 0, t1.zip + t2.zip = 10, t1.zip + t2.zip + t3.zip = 20
| runtime filters: RF000 <- t3.id
|
|--08:EXCHANGE [HASH(t3.id)]
| |
| 02:SCAN HDFS [functional.testtbl t3]
| partitions=1/1 files=0 size=0B
| predicates: t3.id IS NOT NULL
|
07:EXCHANGE [HASH(t1.id)]
|
03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: t1.id - 1 = t2.id + 1
| other join predicates: t1.zip = 94611, t2.zip = 94104
| runtime filters: RF002 <- t2.id + 1
|
|--06:EXCHANGE [HASH(t2.id + 1)]
| |
| 01:SCAN HDFS [functional.testtbl t2]
| partitions=1/1 files=0 size=0B
| predicates: t2.id = 15
|
05:EXCHANGE [HASH(t1.id - 1)]
|
00:SCAN HDFS [functional.testtbl t1]
partitions=1/1 files=0 size=0B
predicates: t1.id IS NOT NULL, t1.id > 0
runtime filters: RF000 -> t1.id, RF002 -> t1.id - 1
====
# the same thing with subqueries; should produce the same result
select *
from (select * from functional.testtbl a1) t1 right outer join (select * from functional.testtbl a2) t2 on (
t1.id - 1 = t2.id + 1 and t1.zip = 94611 and t2.zip = 94104)
right outer join (select * from functional.testtbl a3) t3 on (
t1.id = t3.id and t2.id = 15 and t1.id - t2.id = 0 and t3.zip = 94720 )
where t1.id > 0 and t2.id is null and t3.id is not null
and t1.zip + t2.zip = 10 and t1.zip + t2.zip + t3.zip= 20
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: a1.id = a3.id
| other join predicates: a3.zip = 94720, a1.id - a2.id = 0
| other predicates: a2.id IS NULL, a1.id > 0, a1.zip + a2.zip = 10, a1.zip + a2.zip + a3.zip = 20
| runtime filters: RF000 <- a3.id
|
|--02:SCAN HDFS [functional.testtbl a3]
| partitions=1/1 files=0 size=0B
| predicates: a3.id IS NOT NULL
|
03:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: a1.id - 1 = a2.id + 1
| other join predicates: a1.zip = 94611, a2.zip = 94104
| runtime filters: RF002 <- a2.id + 1
|
|--01:SCAN HDFS [functional.testtbl a2]
| partitions=1/1 files=0 size=0B
| predicates: a2.id = 15
|
00:SCAN HDFS [functional.testtbl a1]
partitions=1/1 files=0 size=0B
predicates: a1.id IS NOT NULL, a1.id > 0
runtime filters: RF000 -> a1.id, RF002 -> a1.id - 1
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
09:EXCHANGE [UNPARTITIONED]
|
04:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: a1.id = a3.id
| other join predicates: a3.zip = 94720, a1.id - a2.id = 0
| other predicates: a2.id IS NULL, a1.id > 0, a1.zip + a2.zip = 10, a1.zip + a2.zip + a3.zip = 20
| runtime filters: RF000 <- a3.id
|
|--08:EXCHANGE [HASH(a3.id)]
| |
| 02:SCAN HDFS [functional.testtbl a3]
| partitions=1/1 files=0 size=0B
| predicates: a3.id IS NOT NULL
|
07:EXCHANGE [HASH(a1.id)]
|
03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: a1.id - 1 = a2.id + 1
| other join predicates: a1.zip = 94611, a2.zip = 94104
| runtime filters: RF002 <- a2.id + 1
|
|--06:EXCHANGE [HASH(a2.id + 1)]
| |
| 01:SCAN HDFS [functional.testtbl a2]
| partitions=1/1 files=0 size=0B
| predicates: a2.id = 15
|
05:EXCHANGE [HASH(a1.id - 1)]
|
00:SCAN HDFS [functional.testtbl a1]
partitions=1/1 files=0 size=0B
predicates: a1.id IS NOT NULL, a1.id > 0
runtime filters: RF000 -> a1.id, RF002 -> a1.id - 1
====
# right outer join requires the join op to be partitioned, otherwise non-matches cause
# duplicates
select a.tinyint_col, b.id
from functional.alltypesagg a
right outer join functional.alltypestiny b on (a.tinyint_col = b.id)
where a.tinyint_col is null
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: a.tinyint_col = b.id
| other predicates: a.tinyint_col IS NULL
| runtime filters: RF000 <- b.id
|
|--01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypesagg a]
partitions=11/11 files=11 size=814.73KB
runtime filters: RF000 -> a.tinyint_col
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
05:EXCHANGE [UNPARTITIONED]
|
02:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: a.tinyint_col = b.id
| other predicates: a.tinyint_col IS NULL
| runtime filters: RF000 <- b.id
|
|--04:EXCHANGE [HASH(b.id)]
| |
| 01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
|
03:EXCHANGE [HASH(a.tinyint_col)]
|
00:SCAN HDFS [functional.alltypesagg a]
partitions=11/11 files=11 size=814.73KB
runtime filters: RF000 -> a.tinyint_col
====
# same for full outer joins
select a.tinyint_col, b.id
from functional.alltypesagg a
full outer join functional.alltypestiny b on (a.tinyint_col = b.id)
where a.tinyint_col is null
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.tinyint_col = b.id
| other predicates: a.tinyint_col IS NULL
|
|--01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypesagg a]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
05:EXCHANGE [UNPARTITIONED]
|
02:HASH JOIN [FULL OUTER JOIN, PARTITIONED]
| hash predicates: a.tinyint_col = b.id
| other predicates: a.tinyint_col IS NULL
|
|--04:EXCHANGE [HASH(b.id)]
| |
| 01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
|
03:EXCHANGE [HASH(a.tinyint_col)]
|
00:SCAN HDFS [functional.alltypesagg a]
partitions=11/11 files=11 size=814.73KB
====
# Predicate assignment when query has a full outer join (IMPALA-1371)
select *
from functional.alltypes a full outer join functional.alltypes b
# predicate on a from the on clause is not assigned to the scan node
on (a.id = b.id and a.int_col < 10 and b.tinyint_col != 5)
# predicate on b from the where clause is assigned to the scan and
# also to the full outer join
where b.bigint_col > 10
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.id = b.id
| other join predicates: a.int_col < 10, b.tinyint_col != 5
| other predicates: b.bigint_col > 10
|
|--01:SCAN HDFS [functional.alltypes b]
| partitions=24/24 files=24 size=478.45KB
| predicates: b.bigint_col > 10
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
====
# Predicate assignment when query has a full outer followed by an inner join
select *
from functional.alltypes a full outer join functional.alltypes b
on (a.id = b.id)
inner join functional.alltypes c
on (a.int_col = c.int_col and a.tinyint_col < 10)
# predicate on b from the where clause is assigned to the
# first full outer join
where b.tinyint_col > 20
---- PLAN
PLAN-ROOT SINK
|
05:HASH JOIN [INNER JOIN]
| hash predicates: c.int_col = a.int_col
| other predicates: a.tinyint_col < 10
| runtime filters: RF000 <- a.int_col
|
|--04:HASH JOIN [FULL OUTER JOIN]
| | hash predicates: a.id = b.id
| | other predicates: b.tinyint_col > 20
| |
| |--01:SCAN HDFS [functional.alltypes b]
| | partitions=24/24 files=24 size=478.45KB
| | predicates: b.tinyint_col > 20
| |
| 00:SCAN HDFS [functional.alltypes a]
| partitions=24/24 files=24 size=478.45KB
| predicates: a.tinyint_col < 10
|
02:SCAN HDFS [functional.alltypes c]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> c.int_col
====
# Predicate assignment when query has an inner join followed by a full outer join
select *
from functional.alltypes a inner join functional.alltypes b
on (a.id = b.id)
full outer join functional.alltypes c
on (a.int_col = c.int_col and a.tinyint_col < b.tinyint_col and a.bigint_col < 10)
# predicate on a is assigned to the scan and the full outer join, predicate on a and b
# is assigned to the full outer join
# TODO Ideally, the predicate on a and b should be assigned to the inner join and then
# re-assigned to the full outer join.
where a.smallint_col = 100 and a.float_col > b.float_col
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [FULL OUTER JOIN]
| hash predicates: c.int_col = a.int_col
| other join predicates: a.bigint_col < 10, a.tinyint_col < b.tinyint_col
| other predicates: a.smallint_col = 100, a.float_col > b.float_col
|
|--03:HASH JOIN [INNER JOIN]
| | hash predicates: b.id = a.id
| | runtime filters: RF000 <- a.id
| |
| |--00:SCAN HDFS [functional.alltypes a]
| | partitions=24/24 files=24 size=478.45KB
| | predicates: a.smallint_col = 100
| |
| 01:SCAN HDFS [functional.alltypes b]
| partitions=24/24 files=24 size=478.45KB
| runtime filters: RF000 -> b.id
|
02:SCAN HDFS [functional.alltypes c]
partitions=24/24 files=24 size=478.45KB
====
# Predicate assignment when query has a sequence of full outer joins interleaved with
# an inner join
select *
from functional.alltypes a full outer join functional.alltypes b
on (a.id = b.id and a.int_col < 10)
inner join functional.alltypes c
on (a.tinyint_col = c.tinyint_col and b.int_col > 10)
full outer join functional.alltypes d
# predicate on b from this on clause is assigned to the full outer join as a join
# predicate; it is not assigned to the b scan
on (a.tinyint_col = d.tinyint_col and b.int_col < 20)
# predicates from the where clause are assigned to the last full outer join; the
# predicate that references only a is also assigned to the a scan
where a.bool_col = false and a.float_col < b.float_col
---- PLAN
PLAN-ROOT SINK
|
07:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.tinyint_col = d.tinyint_col
| other join predicates: b.int_col < 20
| other predicates: a.float_col < b.float_col, a.bool_col = FALSE
|
|--03:SCAN HDFS [functional.alltypes d]
| partitions=24/24 files=24 size=478.45KB
|
06:HASH JOIN [INNER JOIN]
| hash predicates: a.tinyint_col = c.tinyint_col
| other predicates: b.int_col > 10
| runtime filters: RF000 <- c.tinyint_col
|
|--02:SCAN HDFS [functional.alltypes c]
| partitions=24/24 files=24 size=478.45KB
|
05:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.id = b.id
| other join predicates: a.int_col < 10
|
|--01:SCAN HDFS [functional.alltypes b]
| partitions=24/24 files=24 size=478.45KB
| predicates: b.int_col > 10
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
predicates: a.bool_col = FALSE
runtime filters: RF000 -> a.tinyint_col
====
# Predicate assignment through an inline-view that has a full outer join
select 1
from (
select a.tinyint_col x, b.tinyint_col y
from functional.alltypes a
full outer join functional.alltypes b
on (a.id = b.id)
inner join functional.alltypes c
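# predicates on a and b from the on clause of the inner join are assigned to
# the inner join itself (not to the first full outer join); the predicate that
# references only b is also assigned to the b scan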
on (b.smallint_col = c.smallint_col and a.int_col < b.int_col and b.id < 10)
full outer join functional.alltypes d
# predicate on a and b from the on clause of the full outer join is assigned
# to the full outer join
on (c.id = d.id and a.bigint_col > b.bigint_col)) v
# predicate not specified in the scope of the inline view is assigned to the last
# full outer join of the inline view that materializes the corresponding
# tuple ids
where x != y
---- PLAN
PLAN-ROOT SINK
|
06:HASH JOIN [FULL OUTER JOIN]
| hash predicates: c.id = d.id
| other join predicates: a.bigint_col > b.bigint_col
| other predicates: a.tinyint_col != b.tinyint_col
|
|--03:SCAN HDFS [functional.alltypes d]
| partitions=24/24 files=24 size=478.45KB
|
05:HASH JOIN [INNER JOIN]
| hash predicates: b.smallint_col = c.smallint_col
| other predicates: a.int_col < b.int_col, b.id < 10
| runtime filters: RF000 <- c.smallint_col
|
|--02:SCAN HDFS [functional.alltypes c]
| partitions=24/24 files=24 size=478.45KB
|
04:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.id = b.id
|
|--01:SCAN HDFS [functional.alltypes b]
| partitions=24/24 files=24 size=478.45KB
| predicates: b.id < 10
| runtime filters: RF000 -> b.smallint_col
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
====
# Predicate assignment when query has an inner join followed by full outer join
select 1
from functional.alltypes a
inner join functional.alltypes b
on (a.id = b.id)
full outer join functional.alltypes c
on (b.int_col = c.int_col and c.int_col < 10)
# predicates on a and b from the where clause are assigned to the last full outer join
# that materializes the corresponding tuple ids
where a.bigint_col = b.bigint_col and a.tinyint_col < b.tinyint_col
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [FULL OUTER JOIN]
| hash predicates: b.int_col = c.int_col
| other join predicates: c.int_col < 10
| other predicates: a.bigint_col = b.bigint_col, a.tinyint_col < b.tinyint_col
|
|--02:SCAN HDFS [functional.alltypes c]
| partitions=24/24 files=24 size=478.45KB
|
03:HASH JOIN [INNER JOIN]
| hash predicates: b.id = a.id
| runtime filters: RF000 <- a.id
|
|--00:SCAN HDFS [functional.alltypes a]
| partitions=24/24 files=24 size=478.45KB
|
01:SCAN HDFS [functional.alltypes b]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> b.id
====
# Predicate assignment when query has a sequence of inner, outer and full
# outer joins
select 1
from functional.alltypes a
inner join functional.alltypes b
on (a.id = b.id)
left outer join functional.alltypes c
on (a.int_col = c.int_col and c.tinyint_col = 10)
full outer join functional.alltypes d
on (b.string_col = d.string_col and a.tinyint_col < b.tinyint_col)
where a.float_col = b.float_col and b.smallint_col = 1 and d.tinyint_col < 10
---- PLAN
PLAN-ROOT SINK
|
06:HASH JOIN [FULL OUTER JOIN]
| hash predicates: b.string_col = d.string_col
| other join predicates: a.tinyint_col < b.tinyint_col
| other predicates: b.smallint_col = 1, a.float_col = b.float_col, d.tinyint_col < 10
|
|--03:SCAN HDFS [functional.alltypes d]
| partitions=24/24 files=24 size=478.45KB
| predicates: d.tinyint_col < 10
|
05:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: c.int_col = a.int_col
| other join predicates: c.tinyint_col = 10
| runtime filters: RF000 <- a.int_col
|
|--04:HASH JOIN [INNER JOIN]
| | hash predicates: a.id = b.id
| | runtime filters: RF002 <- b.id
| |
| |--01:SCAN HDFS [functional.alltypes b]
| | partitions=24/24 files=24 size=478.45KB
| | predicates: b.smallint_col = 1
| |
| 00:SCAN HDFS [functional.alltypes a]
| partitions=24/24 files=24 size=478.45KB
| runtime filters: RF002 -> a.id
|
02:SCAN HDFS [functional.alltypes c]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> c.int_col
====
# IMPALA-2144: Test correct assignment of Having-clause predicates
# referencing a grouping column coming from an outer-joined table ref.
select b.int_col, sum(b.double_col)
from functional.alltypes a left outer join functional.alltypestiny b
on (a.id = b.id)
group by a.bool_col, a.int_col, b.bool_col, b.int_col
having a.bool_col is null and a.int_col is not null
and b.bool_col is null and b.int_col is not null
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: sum(b.double_col)
| group by: a.bool_col, a.int_col, b.bool_col, b.int_col
| having: b.bool_col IS NULL, b.int_col IS NOT NULL
|
02:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: a.id = b.id
|
|--01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
| predicates: b.int_col IS NOT NULL
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
predicates: a.bool_col IS NULL, a.int_col IS NOT NULL
====
# IMPALA-2144: Same as above but with a full outer join.
select b.int_col, sum(b.double_col)
from functional.alltypes a full outer join functional.alltypestiny b
on (a.id = b.id)
group by a.bool_col, a.int_col, b.bool_col, b.int_col
having a.bool_col is null and a.int_col is not null
and b.bool_col is null and b.int_col is not null
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: sum(b.double_col)
| group by: a.bool_col, a.int_col, b.bool_col, b.int_col
| having: a.bool_col IS NULL, a.int_col IS NOT NULL, b.bool_col IS NULL, b.int_col IS NOT NULL
|
02:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.id = b.id
|
|--01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
| predicates: b.int_col IS NOT NULL
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
predicates: a.int_col IS NOT NULL
====
# IMPALA-2765: Predicate assignment when outer joins have case expressions as predicates
select *
from functional.alltypes a
left outer join functional.alltypes b
on (a.id = b.id)
where (
case when a.id > 100 then
a.timestamp_col
else
a.timestamp_col
end) >= cast('2001-01-01 00:00:00' as timestamp);
---- PLAN
PLAN-ROOT SINK
|
02:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: b.id = a.id
| runtime filters: RF000 <- a.id
|
|--00:SCAN HDFS [functional.alltypes a]
| partitions=24/24 files=24 size=478.45KB
| predicates: (CASE WHEN a.id > 100 THEN a.timestamp_col ELSE a.timestamp_col END) >= TIMESTAMP '2001-01-01 00:00:00'
|
01:SCAN HDFS [functional.alltypes b]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> b.id
====
# IMPALA-3071: Test correct assignment of non-join conjuncts belonging to the On-clause
# of an inner join that follows an outer join.
select 1 from functional.alltypestiny a
left outer join functional.alltypestiny b
on a.id = b.id
inner join functional.alltypestiny c
on b.id = c.id and b.int_col < 0 and a.int_col > 10
---- PLAN
PLAN-ROOT SINK
|
05:HASH JOIN [INNER JOIN]
| hash predicates: c.id = b.id
| other predicates: b.int_col < 0
| runtime filters: RF000 <- b.id
|
|--04:HASH JOIN [LEFT OUTER JOIN]
| | hash predicates: a.id = b.id
| |
| |--01:SCAN HDFS [functional.alltypestiny b]
| | partitions=4/4 files=4 size=460B
| | predicates: b.int_col < 0
| |
| 00:SCAN HDFS [functional.alltypestiny a]
| partitions=4/4 files=4 size=460B
| predicates: a.int_col > 10
|
02:SCAN HDFS [functional.alltypestiny c]
partitions=4/4 files=4 size=460B
runtime filters: RF000 -> c.id
====
# IMPALA-3071: Same as above but with a right outer join.
select 1 from functional.alltypestiny a
right outer join functional.alltypestiny b
on a.id = b.id
inner join functional.alltypestiny c
on b.id = c.id and b.int_col < 0 and a.int_col > 10
---- PLAN
PLAN-ROOT SINK
|
05:HASH JOIN [INNER JOIN]
| hash predicates: c.id = b.id
| other predicates: a.int_col > 10
| runtime filters: RF000 <- b.id
|
|--04:HASH JOIN [RIGHT OUTER JOIN]
| | hash predicates: a.id = b.id
| | runtime filters: RF002 <- b.id
| |
| |--01:SCAN HDFS [functional.alltypestiny b]
| | partitions=4/4 files=4 size=460B
| | predicates: b.int_col < 0
| |
| 00:SCAN HDFS [functional.alltypestiny a]
| partitions=4/4 files=4 size=460B
| predicates: a.int_col > 10
| runtime filters: RF002 -> a.id
|
02:SCAN HDFS [functional.alltypestiny c]
partitions=4/4 files=4 size=460B
runtime filters: RF000 -> c.id
====
# IMPALA-3071: Same as above but with a full outer join.
select 1 from functional.alltypestiny a
full outer join functional.alltypestiny b
on a.id = b.id
inner join functional.alltypestiny c
on b.id = c.id and b.int_col < 0 and a.int_col > 10
---- PLAN
PLAN-ROOT SINK
|
05:HASH JOIN [INNER JOIN]
| hash predicates: b.id = c.id
| other predicates: a.int_col > 10, b.int_col < 0
| runtime filters: RF000 <- c.id
|
|--02:SCAN HDFS [functional.alltypestiny c]
| partitions=4/4 files=4 size=460B
|
04:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.id = b.id
|
|--01:SCAN HDFS [functional.alltypestiny b]
| partitions=4/4 files=4 size=460B
| predicates: b.int_col < 0
| runtime filters: RF000 -> b.id
|
00:SCAN HDFS [functional.alltypestiny a]
partitions=4/4 files=4 size=460B
predicates: a.int_col > 10
====
# IMPALA-3071: Test that the inner join On-clause predicate is placed
# precisely at the correct outer join (not before or after).
select 1 from functional.alltypestiny a
left outer join functional.alltypestiny b
on a.id = b.id
left outer join functional.alltypestiny c
on b.id = c.id
inner join functional.alltypestiny d
on b.int_col < 0 and a.int_col > 10
full outer join functional.alltypestiny e
on d.id = e.id
---- PLAN
PLAN-ROOT SINK
|
09:HASH JOIN [FULL OUTER JOIN]
| hash predicates: e.id = d.id
|
|--08:NESTED LOOP JOIN [INNER JOIN]
| | predicates: a.int_col > 10, b.int_col < 0
| |
| |--07:HASH JOIN [RIGHT OUTER JOIN]
| | | hash predicates: c.id = b.id
| | | runtime filters: RF000 <- b.id
| | |
| | |--06:HASH JOIN [LEFT OUTER JOIN]
| | | | hash predicates: a.id = b.id
| | | |
| | | |--01:SCAN HDFS [functional.alltypestiny b]
| | | | partitions=4/4 files=4 size=460B
| | | | predicates: b.int_col < 0
| | | |
| | | 00:SCAN HDFS [functional.alltypestiny a]
| | | partitions=4/4 files=4 size=460B
| | | predicates: a.int_col > 10
| | |
| | 02:SCAN HDFS [functional.alltypestiny c]
| | partitions=4/4 files=4 size=460B
| | runtime filters: RF000 -> c.id
| |
| 03:SCAN HDFS [functional.alltypestiny d]
| partitions=4/4 files=4 size=460B
|
04:SCAN HDFS [functional.alltypestiny e]
partitions=4/4 files=4 size=460B
====
# IMPALA-3125: Test that the On-clause predicates from an outer join are assigned to the
# corresponding outer-join node, even if the predicates do not reference the join rhs.
select a.id aid, b.id bid, a.int_col aint, b.int_col bint
from functional.alltypes a
inner join functional.alltypes b
on a.int_col = b.int_col
left outer join functional.alltypes c
on a.id = b.id and b.bigint_col = c.bigint_col
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: b.bigint_col = c.bigint_col
| other join predicates: a.id = b.id
|
|--02:SCAN HDFS [functional.alltypes c]
| partitions=24/24 files=24 size=478.45KB
|
03:HASH JOIN [INNER JOIN]
| hash predicates: b.int_col = a.int_col
| runtime filters: RF000 <- a.int_col
|
|--00:SCAN HDFS [functional.alltypes a]
| partitions=24/24 files=24 size=478.45KB
|
01:SCAN HDFS [functional.alltypes b]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> b.int_col
====
# IMPALA-3167: Test correct assignment of a WHERE-clause predicate through an inline view
# that has a grouping aggregation and an outer join. The predicate can be assigned at the
# scan on the nullable side of the outer join, but it must also be evaluated after the join.
# In the expected plan 'id < 10' shows up twice: as a scan predicate on 't2' (node 01)
# and as a having predicate of the aggregation (node 03) above the outer join.
select v2.id, v2.s
from (select v1.id, sum(bigint_col) s
from functional.alltypes t1
left outer join (select t2.int_col, t2.id
from functional.alltypessmall t2) v1
on t1.int_col = v1.int_col
group by v1.id) v2
where v2.id < 10
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: sum(bigint_col)
| group by: t2.id
| having: v1.id < 10
|
02:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: t1.int_col = t2.int_col
|
|--01:SCAN HDFS [functional.alltypessmall t2]
| partitions=4/4 files=4 size=6.32KB
| predicates: t2.id < 10
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
====
# IMPALA-3126: Test assignment of an inner join On-clause predicate. The predicate
# may not be assigned below the join materializing 'd'.
# The On-clause 'a.int_col = b.int_col' of the inner join does not reference 'd'. In the
# expected plan it is evaluated at the nested loop join with 'd' (node 06), above both
# outer joins.
select 1 from functional.alltypes a
left outer join functional.alltypes b
on a.id = b.id
right outer join functional.alltypes c
on b.id = c.id
inner join functional.alltypes d
on a.int_col = b.int_col
---- PLAN
PLAN-ROOT SINK
|
06:NESTED LOOP JOIN [INNER JOIN]
| predicates: a.int_col = b.int_col
|
|--03:SCAN HDFS [functional.alltypes d]
| partitions=24/24 files=24 size=478.45KB
|
05:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: b.id = c.id
| runtime filters: RF000 <- c.id
|
|--02:SCAN HDFS [functional.alltypes c]
| partitions=24/24 files=24 size=478.45KB
|
04:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: a.id = b.id
|
|--01:SCAN HDFS [functional.alltypes b]
| partitions=24/24 files=24 size=478.45KB
| runtime filters: RF000 -> b.id
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
====
# IMPALA-3126: Same as above but with a semi join at the end, i.e. the last join with 'd'
# is a left semi join instead of an inner join.
# In the expected plan 'a.int_col = b.int_col' is a join predicate of the left semi
# nested loop join with 'd' (node 06), above both outer joins.
select 1 from functional.alltypes a
left outer join functional.alltypes b
on a.id = b.id
right outer join functional.alltypes c
on b.id = c.id
left semi join functional.alltypes d
on a.int_col = b.int_col
---- PLAN
PLAN-ROOT SINK
|
06:NESTED LOOP JOIN [LEFT SEMI JOIN]
| join predicates: a.int_col = b.int_col
|
|--03:SCAN HDFS [functional.alltypes d]
| partitions=24/24 files=24 size=478.45KB
|
05:HASH JOIN [RIGHT OUTER JOIN]
| hash predicates: b.id = c.id
| runtime filters: RF000 <- c.id
|
|--02:SCAN HDFS [functional.alltypes c]
| partitions=24/24 files=24 size=478.45KB
|
04:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: a.id = b.id
|
|--01:SCAN HDFS [functional.alltypes b]
| partitions=24/24 files=24 size=478.45KB
| runtime filters: RF000 -> b.id
|
00:SCAN HDFS [functional.alltypes a]
partitions=24/24 files=24 size=478.45KB
====
# IMPALA-5856: Test correct assignment of a join predicate with the following properties:
# - from the On-clause of a left outer join
# - only references the left-hand side tuples (not the right-hand side tuple)
# - references full-outer joined tuples; the full outer join appears on the left
# In the expected plan the On-clause conjuncts on 't1' and 't2' are "other join
# predicates" of the left outer join (node 04); none of them is assigned to the full
# outer join (node 03) or to the scans of 't1' and 't2'.
select * from functional.alltypes t1
full outer join functional.alltypessmall t2
on t1.id = t2.id
left outer join functional.alltypestiny t3
on coalesce(t1.id, t2.id) = t3.id and coalesce(t1.int_col, t2.int_col) = 2
and t1.bigint_col > 10 and t2.bigint_col > 30
where concat(t1.string_col, t2.string_col) = 'test1' and t3.string_col = 'test2'
---- PLAN
PLAN-ROOT SINK
|
04:HASH JOIN [LEFT OUTER JOIN]
| hash predicates: coalesce(t1.id, t2.id) = t3.id
| other join predicates: t1.bigint_col > 10, t2.bigint_col > 30, coalesce(t1.int_col, t2.int_col) = 2
| other predicates: t3.string_col = 'test2'
|
|--02:SCAN HDFS [functional.alltypestiny t3]
| partitions=4/4 files=4 size=460B
| predicates: t3.string_col = 'test2'
|
03:HASH JOIN [FULL OUTER JOIN]
| hash predicates: t1.id = t2.id
| other predicates: concat(t1.string_col, t2.string_col) = 'test1'
|
|--01:SCAN HDFS [functional.alltypessmall t2]
| partitions=4/4 files=4 size=6.32KB
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
====
# IMPALA-6123: Test the partition compatibility check with consecutive outer joins.
# In this query the output partition of the right outer join should be the rhs partition
# t3.id, which is not compatible with the aggregation partition t2.id. There should be
# an exchange node in the aggregation.
# In the expected plan this is exchange 10 [HASH(t2.id)], placed between the streaming
# aggregation (node 05) and the finalizing aggregation (node 11).
# The first left outer join is used to ensure 't2' is made nullable before the fragment
# executing the right outer join.
select /* +straight_join */ t2.id, count(*)
from functional.alltypes t1
left outer join /* +shuffle */ functional.alltypessmall t2
on t1.int_col = t2.int_col
right outer join /* +shuffle */ functional.alltypestiny t3
on t2.id = t3.id
group by t2.id
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
12:EXCHANGE [UNPARTITIONED]
|
11:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: t2.id
|
10:EXCHANGE [HASH(t2.id)]
|
05:AGGREGATE [STREAMING]
| output: count(*)
| group by: t2.id
|
04:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: t2.id = t3.id
| runtime filters: RF000 <- t3.id
|
|--09:EXCHANGE [HASH(t3.id)]
| |
| 02:SCAN HDFS [functional.alltypestiny t3]
| partitions=4/4 files=4 size=460B
|
08:EXCHANGE [HASH(t2.id)]
|
03:HASH JOIN [LEFT OUTER JOIN, PARTITIONED]
| hash predicates: t1.int_col = t2.int_col
|
|--07:EXCHANGE [HASH(t2.int_col)]
| |
| 01:SCAN HDFS [functional.alltypessmall t2]
| partitions=4/4 files=4 size=6.32KB
| runtime filters: RF000 -> t2.id
|
06:EXCHANGE [HASH(t1.int_col)]
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
====
# IMPALA-6123: Test the partition compatibility check with consecutive outer joins.
# In this query the output partition of the full outer join should be random. There should
# be an exchange node in the aggregation.
# In the expected plan this is exchange 10 [HASH(t1.int_col)], placed between the
# streaming aggregation (node 05) and the finalizing aggregation (node 11).
# The first right outer join is used to ensure 't1' is made nullable before the fragment
# executing the full outer join.
select /* +straight_join */ t1.int_col, count(*)
from functional.alltypes t1
right outer join /* +shuffle */ functional.alltypes t2
on t1.smallint_col = t2.smallint_col
full outer join /* +shuffle */ functional.alltypes t3
on t1.int_col = t3.int_col
group by t1.int_col
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
12:EXCHANGE [UNPARTITIONED]
|
11:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: t1.int_col
|
10:EXCHANGE [HASH(t1.int_col)]
|
05:AGGREGATE [STREAMING]
| output: count(*)
| group by: t1.int_col
|
04:HASH JOIN [FULL OUTER JOIN, PARTITIONED]
| hash predicates: t1.int_col = t3.int_col
|
|--09:EXCHANGE [HASH(t3.int_col)]
| |
| 02:SCAN HDFS [functional.alltypes t3]
| partitions=24/24 files=24 size=478.45KB
|
08:EXCHANGE [HASH(t1.int_col)]
|
03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
| hash predicates: t1.smallint_col = t2.smallint_col
| runtime filters: RF000 <- t2.smallint_col
|
|--07:EXCHANGE [HASH(t2.smallint_col)]
| |
| 01:SCAN HDFS [functional.alltypes t2]
| partitions=24/24 files=24 size=478.45KB
|
06:EXCHANGE [HASH(t1.smallint_col)]
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> t1.smallint_col
====