mirror of
https://github.com/apache/impala.git
synced 2026-01-22 18:02:34 -05:00
When generating plans with left semi/anti joins (typically resulting from subquery rewrites), the planner now considers inserting a distinct aggregation on the inner side of the join. The decision is based on whether that aggregation would reduce the number of rows by more than 75%. This is fairly conservative and the optimization might be beneficial for smaller reductions, but the conservative threshold is chosen to reduce the number of potential plan regressions. The aggregation can both reduce the # of rows and the width of the rows, by projecting out unneeded slots. ENABLE_DISTINCT_SEMI_JOIN_OPTIMIZATION query option is added to allow toggling the optimization. Tests: * Add positive and negative planner tests for various cases - including semi/anti joins, missing stats, broadcast/shuffle, different numbers of join predicates. * Add some end-to-end tests to verify plans execute correctly. Change-Id: Icbb955e805d9e764edf11c57b98f341b88a37fcc Reviewed-on: http://gerrit.cloudera.org:8080/16180 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
331 lines
10 KiB
Plaintext
331 lines
10 KiB
Plaintext
# Right outer joins with non-equi join predicates
|
|
select straight_join *
|
|
from functional.alltypestiny a right outer join functional.alltypes b
|
|
on a.id != b.id or a.int_col < b.int_col
|
|
right outer join functional.alltypesagg c
|
|
on a.smallint_col >= c.smallint_col
|
|
where a.id < 10 and c.bigint_col = 10
|
|
---- PLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
04:NESTED LOOP JOIN [RIGHT OUTER JOIN]
|
|
| join predicates: a.smallint_col >= c.smallint_col
|
|
| predicates: a.id < 10
|
|
| row-size=273B cardinality=7.30K
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypesagg c]
|
|
| HDFS partitions=11/11 files=11 size=814.73KB
|
|
| predicates: c.bigint_col = 10
|
|
| row-size=95B cardinality=11
|
|
|
|
|
03:NESTED LOOP JOIN [RIGHT OUTER JOIN]
|
|
| join predicates: a.id != b.id OR a.int_col < b.int_col
|
|
| row-size=178B cardinality=7.30K
|
|
|
|
|
|--01:SCAN HDFS [functional.alltypes b]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| row-size=89B cardinality=7.30K
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny a]
|
|
HDFS partitions=4/4 files=4 size=460B
|
|
predicates: a.id < 10
|
|
row-size=89B cardinality=1
|
|
---- DISTRIBUTEDPLAN
|
|
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT OUTER JOIN type with no equi-join predicates can only be executed with a single node plan.
|
|
====
|
|
# Right semi joins with non-equi join predicates
|
|
select straight_join *
|
|
from functional.alltypestiny a right semi join functional.alltypessmall c
|
|
on a.tinyint_col > c.tinyint_col
|
|
right semi join functional.alltypesagg d
|
|
on c.tinyint_col < d.bigint_col
|
|
where d.bigint_col < 10
|
|
---- PLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
04:NESTED LOOP JOIN [RIGHT SEMI JOIN]
|
|
| join predicates: c.tinyint_col < d.bigint_col
|
|
| row-size=95B cardinality=1.10K
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypesagg d]
|
|
| HDFS partitions=11/11 files=11 size=814.73KB
|
|
| predicates: d.bigint_col < 10
|
|
| row-size=95B cardinality=1.10K
|
|
|
|
|
03:NESTED LOOP JOIN [RIGHT SEMI JOIN]
|
|
| join predicates: a.tinyint_col > c.tinyint_col
|
|
| row-size=1B cardinality=100
|
|
|
|
|
|--01:SCAN HDFS [functional.alltypessmall c]
|
|
| HDFS partitions=4/4 files=4 size=6.32KB
|
|
| row-size=1B cardinality=100
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny a]
|
|
HDFS partitions=4/4 files=4 size=460B
|
|
row-size=1B cardinality=8
|
|
---- DISTRIBUTEDPLAN
|
|
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT SEMI JOIN type with no equi-join predicates can only be executed with a single node plan.
|
|
====
|
|
# Full outer joins with non-equi join predicates
|
|
select straight_join *
|
|
from functional.alltypestiny a full outer join functional.alltypessmall b
|
|
on a.id != b.id or a.int_col != b.int_col
|
|
full outer join functional.alltypesagg c
|
|
on a.tinyint_col > c.tinyint_col
|
|
full outer join functional.alltypes d
|
|
on c.int_col > d.int_col
|
|
where a.bigint_col != c.bigint_col and a.id < 10
|
|
---- PLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
06:NESTED LOOP JOIN [FULL OUTER JOIN]
|
|
| join predicates: c.int_col > d.int_col
|
|
| predicates: a.bigint_col != c.bigint_col, a.id < 10
|
|
| row-size=362B cardinality=18.40K
|
|
|
|
|
|--03:SCAN HDFS [functional.alltypes d]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| row-size=89B cardinality=7.30K
|
|
|
|
|
05:NESTED LOOP JOIN [FULL OUTER JOIN]
|
|
| join predicates: a.tinyint_col > c.tinyint_col
|
|
| row-size=273B cardinality=11.10K
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypesagg c]
|
|
| HDFS partitions=11/11 files=11 size=814.73KB
|
|
| row-size=95B cardinality=11.00K
|
|
|
|
|
04:NESTED LOOP JOIN [FULL OUTER JOIN]
|
|
| join predicates: a.id != b.id OR a.int_col != b.int_col
|
|
| row-size=178B cardinality=101
|
|
|
|
|
|--01:SCAN HDFS [functional.alltypessmall b]
|
|
| HDFS partitions=4/4 files=4 size=6.32KB
|
|
| row-size=89B cardinality=100
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny a]
|
|
HDFS partitions=4/4 files=4 size=460B
|
|
predicates: a.id < 10
|
|
row-size=89B cardinality=1
|
|
---- DISTRIBUTEDPLAN
|
|
NotImplementedException: Error generating a valid execution plan for this query. A FULL OUTER JOIN type with no equi-join predicates can only be executed with a single node plan.
|
|
====
|
|
# Right anti join with non-equi join predicates
|
|
select straight_join count(*)
|
|
from functional.alltypestiny a right anti join functional.alltypessmall b
|
|
on a.id < b.id
|
|
where b.int_col = 5
|
|
---- PLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
02:NESTED LOOP JOIN [RIGHT ANTI JOIN]
|
|
| join predicates: a.id < b.id
|
|
| row-size=8B cardinality=10
|
|
|
|
|
|--01:SCAN HDFS [functional.alltypessmall b]
|
|
| HDFS partitions=4/4 files=4 size=6.32KB
|
|
| predicates: b.int_col = 5
|
|
| row-size=8B cardinality=10
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny a]
|
|
HDFS partitions=4/4 files=4 size=460B
|
|
row-size=4B cardinality=8
|
|
---- DISTRIBUTEDPLAN
|
|
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT ANTI JOIN type with no equi-join predicates can only be executed with a single node plan.
|
|
====
|
|
# Inner and right joins with non-equi join predicates
|
|
select straight_join count(*)
|
|
from functional.alltypestiny a inner join functional.alltypessmall b on a.id < b.id
|
|
right outer join functional.alltypesagg c on a.int_col != c.int_col
|
|
right semi join functional.alltypes d on c.tinyint_col < d.tinyint_col
|
|
right anti join functional.alltypesnopart e on d.tinyint_col > e.tinyint_col
|
|
where e.id < 10
|
|
---- PLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
09:AGGREGATE [FINALIZE]
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
08:NESTED LOOP JOIN [RIGHT ANTI JOIN]
|
|
| join predicates: d.tinyint_col > e.tinyint_col
|
|
| row-size=5B cardinality=0
|
|
|
|
|
|--04:SCAN HDFS [functional.alltypesnopart e]
|
|
| HDFS partitions=1/1 files=0 size=0B
|
|
| predicates: e.id < 10
|
|
| row-size=5B cardinality=0
|
|
|
|
|
07:NESTED LOOP JOIN [RIGHT SEMI JOIN]
|
|
| join predicates: c.tinyint_col < d.tinyint_col
|
|
| row-size=1B cardinality=7.30K
|
|
|
|
|
|--03:SCAN HDFS [functional.alltypes d]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| row-size=1B cardinality=7.30K
|
|
|
|
|
06:NESTED LOOP JOIN [RIGHT OUTER JOIN]
|
|
| join predicates: a.int_col != c.int_col
|
|
| row-size=17B cardinality=11.00K
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypesagg c]
|
|
| HDFS partitions=11/11 files=11 size=814.73KB
|
|
| row-size=5B cardinality=11.00K
|
|
|
|
|
05:NESTED LOOP JOIN [INNER JOIN]
|
|
| predicates: a.id < b.id
|
|
| row-size=12B cardinality=8
|
|
|
|
|
|--01:SCAN HDFS [functional.alltypessmall b]
|
|
| HDFS partitions=4/4 files=4 size=6.32KB
|
|
| row-size=4B cardinality=100
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny a]
|
|
HDFS partitions=4/4 files=4 size=460B
|
|
row-size=8B cardinality=8
|
|
---- DISTRIBUTEDPLAN
|
|
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT ANTI JOIN type with no equi-join predicates can only be executed with a single node plan.
|
|
====
|
|
# Right semi and outer joins are inverted to make them executable.
|
|
# Same query as above but without the straight join hint.
|
|
select count(*)
|
|
from functional.alltypestiny a inner join functional.alltypessmall b on a.id < b.id
|
|
right outer join functional.alltypesagg c on a.int_col != c.int_col
|
|
right semi join functional.alltypes d on c.tinyint_col < d.tinyint_col
|
|
right anti join functional.alltypesnopart e on d.tinyint_col > e.tinyint_col
|
|
where e.id < 10
|
|
---- DISTRIBUTEDPLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
15:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
14:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
09:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
08:NESTED LOOP JOIN [LEFT ANTI JOIN, BROADCAST]
|
|
| join predicates: d.tinyint_col > e.tinyint_col
|
|
| row-size=5B cardinality=0
|
|
|
|
|
|--13:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 07:NESTED LOOP JOIN [LEFT SEMI JOIN, BROADCAST]
|
|
| | join predicates: c.tinyint_col < d.tinyint_col
|
|
| | row-size=1B cardinality=7.30K
|
|
| |
|
|
| |--12:EXCHANGE [BROADCAST]
|
|
| | |
|
|
| | 06:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST]
|
|
| | | join predicates: a.int_col != c.int_col
|
|
| | | row-size=17B cardinality=11.00K
|
|
| | |
|
|
| | |--11:EXCHANGE [BROADCAST]
|
|
| | | |
|
|
| | | 05:NESTED LOOP JOIN [INNER JOIN, BROADCAST]
|
|
| | | | predicates: a.id < b.id
|
|
| | | | row-size=12B cardinality=100
|
|
| | | |
|
|
| | | |--10:EXCHANGE [BROADCAST]
|
|
| | | | |
|
|
| | | | 00:SCAN HDFS [functional.alltypestiny a]
|
|
| | | | HDFS partitions=4/4 files=4 size=460B
|
|
| | | | row-size=8B cardinality=8
|
|
| | | |
|
|
| | | 01:SCAN HDFS [functional.alltypessmall b]
|
|
| | | HDFS partitions=4/4 files=4 size=6.32KB
|
|
| | | row-size=4B cardinality=100
|
|
| | |
|
|
| | 02:SCAN HDFS [functional.alltypesagg c]
|
|
| | HDFS partitions=11/11 files=11 size=814.73KB
|
|
| | row-size=5B cardinality=11.00K
|
|
| |
|
|
| 03:SCAN HDFS [functional.alltypes d]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| row-size=1B cardinality=7.30K
|
|
|
|
|
04:SCAN HDFS [functional.alltypesnopart e]
|
|
HDFS partitions=1/1 files=0 size=0B
|
|
predicates: e.id < 10
|
|
row-size=5B cardinality=0
|
|
====
|
|
# IMPALA-5689: Do not invert a left outer join with no equi-join predicates.
|
|
select * from (
|
|
select distinct int_col
|
|
from functional.alltypes) t1
|
|
left outer join functional.alltypes t2 on (t2.bigint_col=5)
|
|
---- DISTRIBUTEDPLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
07:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST]
|
|
| row-size=93B cardinality=10
|
|
|
|
|
|--06:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 02:SCAN HDFS [functional.alltypes t2]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| predicates: (t2.bigint_col = 5)
|
|
| row-size=89B cardinality=730
|
|
|
|
|
05:AGGREGATE [FINALIZE]
|
|
| group by: int_col
|
|
| row-size=4B cardinality=10
|
|
|
|
|
04:EXCHANGE [HASH(int_col)]
|
|
|
|
|
01:AGGREGATE [STREAMING]
|
|
| group by: int_col
|
|
| row-size=4B cardinality=10
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
row-size=4B cardinality=7.30K
|
|
====
|
|
# IMPALA-5689: Do not invert a left semi join with no equi-join predicates.
|
|
select * from (
|
|
select distinct int_col
|
|
from functional.alltypes) t1
|
|
left semi join functional.alltypes t2 on (t2.bigint_col=5)
|
|
---- DISTRIBUTEDPLAN
|
|
PLAN-ROOT SINK
|
|
|
|
|
08:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:NESTED LOOP JOIN [LEFT SEMI JOIN, BROADCAST]
|
|
| row-size=4B cardinality=10
|
|
|
|
|
|--07:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 06:EXCHANGE [UNPARTITIONED]
|
|
| | limit: 1
|
|
| |
|
|
| 02:SCAN HDFS [functional.alltypes t2]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| predicates: (t2.bigint_col = 5)
|
|
| limit: 1
|
|
| row-size=8B cardinality=730
|
|
|
|
|
05:AGGREGATE [FINALIZE]
|
|
| group by: int_col
|
|
| row-size=4B cardinality=10
|
|
|
|
|
04:EXCHANGE [HASH(int_col)]
|
|
|
|
|
01:AGGREGATE [STREAMING]
|
|
| group by: int_col
|
|
| row-size=4B cardinality=10
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
row-size=4B cardinality=7.30K
|
|
====
|