Files
impala/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test
Tim Armstrong 63f5e8ec00 IMPALA-1270: add distinct aggregation to semi joins
When generating plans with left semi/anti joins (typically
resulting from subquery rewrites), the planner now
considers inserting a distinct aggregation on the inner
side of the join. The decision is based on whether that
aggregation would reduce the number of rows by more than
75%. This is fairly conservative and the optimization
might be beneficial for smaller reductions, but the
conservative threshold is chosen to reduce the number
of potential plan regressions.

The aggregation can both reduce the # of rows and the
width of the rows, by projecting out unneeded slots.

ENABLE_DISTINCT_SEMI_JOIN_OPTIMIZATION query option is
added to allow toggling the optimization.

Tests:
* Add positive and negative planner tests for various
  cases - including semi/anti joins, missing stats,
  broadcast/shuffle, different numbers of join predicates.
* Add some end-to-end tests to verify plans execute correctly.

Change-Id: Icbb955e805d9e764edf11c57b98f341b88a37fcc
Reviewed-on: http://gerrit.cloudera.org:8080/16180
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2020-07-15 17:10:50 +00:00

331 lines
10 KiB
Plaintext

# Right outer joins with non-equi join predicates
select straight_join *
from functional.alltypestiny a right outer join functional.alltypes b
on a.id != b.id or a.int_col < b.int_col
right outer join functional.alltypesagg c
on a.smallint_col >= c.smallint_col
where a.id < 10 and c.bigint_col = 10
---- PLAN
PLAN-ROOT SINK
|
04:NESTED LOOP JOIN [RIGHT OUTER JOIN]
| join predicates: a.smallint_col >= c.smallint_col
| predicates: a.id < 10
| row-size=273B cardinality=7.30K
|
|--02:SCAN HDFS [functional.alltypesagg c]
| HDFS partitions=11/11 files=11 size=814.73KB
| predicates: c.bigint_col = 10
| row-size=95B cardinality=11
|
03:NESTED LOOP JOIN [RIGHT OUTER JOIN]
| join predicates: a.id != b.id OR a.int_col < b.int_col
| row-size=178B cardinality=7.30K
|
|--01:SCAN HDFS [functional.alltypes b]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypestiny a]
HDFS partitions=4/4 files=4 size=460B
predicates: a.id < 10
row-size=89B cardinality=1
---- DISTRIBUTEDPLAN
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT OUTER JOIN type with no equi-join predicates can only be executed with a single node plan.
====
# Right semi joins with non-equi join predicates
select straight_join *
from functional.alltypestiny a right semi join functional.alltypessmall c
on a.tinyint_col > c.tinyint_col
right semi join functional.alltypesagg d
on c.tinyint_col < d.bigint_col
where d.bigint_col < 10
---- PLAN
PLAN-ROOT SINK
|
04:NESTED LOOP JOIN [RIGHT SEMI JOIN]
| join predicates: c.tinyint_col < d.bigint_col
| row-size=95B cardinality=1.10K
|
|--02:SCAN HDFS [functional.alltypesagg d]
| HDFS partitions=11/11 files=11 size=814.73KB
| predicates: d.bigint_col < 10
| row-size=95B cardinality=1.10K
|
03:NESTED LOOP JOIN [RIGHT SEMI JOIN]
| join predicates: a.tinyint_col > c.tinyint_col
| row-size=1B cardinality=100
|
|--01:SCAN HDFS [functional.alltypessmall c]
| HDFS partitions=4/4 files=4 size=6.32KB
| row-size=1B cardinality=100
|
00:SCAN HDFS [functional.alltypestiny a]
HDFS partitions=4/4 files=4 size=460B
row-size=1B cardinality=8
---- DISTRIBUTEDPLAN
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT SEMI JOIN type with no equi-join predicates can only be executed with a single node plan.
====
# Full outer joins with non-equi join predicates
select straight_join *
from functional.alltypestiny a full outer join functional.alltypessmall b
on a.id != b.id or a.int_col != b.int_col
full outer join functional.alltypesagg c
on a.tinyint_col > c.tinyint_col
full outer join functional.alltypes d
on c.int_col > d.int_col
where a.bigint_col != c.bigint_col and a.id < 10
---- PLAN
PLAN-ROOT SINK
|
06:NESTED LOOP JOIN [FULL OUTER JOIN]
| join predicates: c.int_col > d.int_col
| predicates: a.bigint_col != c.bigint_col, a.id < 10
| row-size=362B cardinality=18.40K
|
|--03:SCAN HDFS [functional.alltypes d]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=89B cardinality=7.30K
|
05:NESTED LOOP JOIN [FULL OUTER JOIN]
| join predicates: a.tinyint_col > c.tinyint_col
| row-size=273B cardinality=11.10K
|
|--02:SCAN HDFS [functional.alltypesagg c]
| HDFS partitions=11/11 files=11 size=814.73KB
| row-size=95B cardinality=11.00K
|
04:NESTED LOOP JOIN [FULL OUTER JOIN]
| join predicates: a.id != b.id OR a.int_col != b.int_col
| row-size=178B cardinality=101
|
|--01:SCAN HDFS [functional.alltypessmall b]
| HDFS partitions=4/4 files=4 size=6.32KB
| row-size=89B cardinality=100
|
00:SCAN HDFS [functional.alltypestiny a]
HDFS partitions=4/4 files=4 size=460B
predicates: a.id < 10
row-size=89B cardinality=1
---- DISTRIBUTEDPLAN
NotImplementedException: Error generating a valid execution plan for this query. A FULL OUTER JOIN type with no equi-join predicates can only be executed with a single node plan.
====
# Right anti join with non-equi join predicates
select straight_join count(*)
from functional.alltypestiny a right anti join functional.alltypessmall b
on a.id < b.id
where b.int_col = 5
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=1
|
02:NESTED LOOP JOIN [RIGHT ANTI JOIN]
| join predicates: a.id < b.id
| row-size=8B cardinality=10
|
|--01:SCAN HDFS [functional.alltypessmall b]
| HDFS partitions=4/4 files=4 size=6.32KB
| predicates: b.int_col = 5
| row-size=8B cardinality=10
|
00:SCAN HDFS [functional.alltypestiny a]
HDFS partitions=4/4 files=4 size=460B
row-size=4B cardinality=8
---- DISTRIBUTEDPLAN
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT ANTI JOIN type with no equi-join predicates can only be executed with a single node plan.
====
# Inner and right joins with non-equi join predicates
select straight_join count(*)
from functional.alltypestiny a inner join functional.alltypessmall b on a.id < b.id
right outer join functional.alltypesagg c on a.int_col != c.int_col
right semi join functional.alltypes d on c.tinyint_col < d.tinyint_col
right anti join functional.alltypesnopart e on d.tinyint_col > e.tinyint_col
where e.id < 10
---- PLAN
PLAN-ROOT SINK
|
09:AGGREGATE [FINALIZE]
| output: count(*)
| row-size=8B cardinality=1
|
08:NESTED LOOP JOIN [RIGHT ANTI JOIN]
| join predicates: d.tinyint_col > e.tinyint_col
| row-size=5B cardinality=0
|
|--04:SCAN HDFS [functional.alltypesnopart e]
| HDFS partitions=1/1 files=0 size=0B
| predicates: e.id < 10
| row-size=5B cardinality=0
|
07:NESTED LOOP JOIN [RIGHT SEMI JOIN]
| join predicates: c.tinyint_col < d.tinyint_col
| row-size=1B cardinality=7.30K
|
|--03:SCAN HDFS [functional.alltypes d]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=1B cardinality=7.30K
|
06:NESTED LOOP JOIN [RIGHT OUTER JOIN]
| join predicates: a.int_col != c.int_col
| row-size=17B cardinality=11.00K
|
|--02:SCAN HDFS [functional.alltypesagg c]
| HDFS partitions=11/11 files=11 size=814.73KB
| row-size=5B cardinality=11.00K
|
05:NESTED LOOP JOIN [INNER JOIN]
| predicates: a.id < b.id
| row-size=12B cardinality=8
|
|--01:SCAN HDFS [functional.alltypessmall b]
| HDFS partitions=4/4 files=4 size=6.32KB
| row-size=4B cardinality=100
|
00:SCAN HDFS [functional.alltypestiny a]
HDFS partitions=4/4 files=4 size=460B
row-size=8B cardinality=8
---- DISTRIBUTEDPLAN
NotImplementedException: Error generating a valid execution plan for this query. A RIGHT ANTI JOIN type with no equi-join predicates can only be executed with a single node plan.
====
# Right semi and outer joins are inverted to make them executable.
# Same query as above but without the straight join hint.
select count(*)
from functional.alltypestiny a inner join functional.alltypessmall b on a.id < b.id
right outer join functional.alltypesagg c on a.int_col != c.int_col
right semi join functional.alltypes d on c.tinyint_col < d.tinyint_col
right anti join functional.alltypesnopart e on d.tinyint_col > e.tinyint_col
where e.id < 10
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
15:AGGREGATE [FINALIZE]
| output: count:merge(*)
| row-size=8B cardinality=1
|
14:EXCHANGE [UNPARTITIONED]
|
09:AGGREGATE
| output: count(*)
| row-size=8B cardinality=1
|
08:NESTED LOOP JOIN [LEFT ANTI JOIN, BROADCAST]
| join predicates: d.tinyint_col > e.tinyint_col
| row-size=5B cardinality=0
|
|--13:EXCHANGE [BROADCAST]
| |
| 07:NESTED LOOP JOIN [LEFT SEMI JOIN, BROADCAST]
| | join predicates: c.tinyint_col < d.tinyint_col
| | row-size=1B cardinality=7.30K
| |
| |--12:EXCHANGE [BROADCAST]
| | |
| | 06:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST]
| | | join predicates: a.int_col != c.int_col
| | | row-size=17B cardinality=11.00K
| | |
| | |--11:EXCHANGE [BROADCAST]
| | | |
| | | 05:NESTED LOOP JOIN [INNER JOIN, BROADCAST]
| | | | predicates: a.id < b.id
| | | | row-size=12B cardinality=100
| | | |
| | | |--10:EXCHANGE [BROADCAST]
| | | | |
| | | | 00:SCAN HDFS [functional.alltypestiny a]
| | | | HDFS partitions=4/4 files=4 size=460B
| | | | row-size=8B cardinality=8
| | | |
| | | 01:SCAN HDFS [functional.alltypessmall b]
| | | HDFS partitions=4/4 files=4 size=6.32KB
| | | row-size=4B cardinality=100
| | |
| | 02:SCAN HDFS [functional.alltypesagg c]
| | HDFS partitions=11/11 files=11 size=814.73KB
| | row-size=5B cardinality=11.00K
| |
| 03:SCAN HDFS [functional.alltypes d]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=1B cardinality=7.30K
|
04:SCAN HDFS [functional.alltypesnopart e]
HDFS partitions=1/1 files=0 size=0B
predicates: e.id < 10
row-size=5B cardinality=0
====
# IMPALA-5689: Do not invert a left outer join with no equi-join predicates.
select * from (
select distinct int_col
from functional.alltypes) t1
left outer join functional.alltypes t2 on (t2.bigint_col=5)
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|
03:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST]
| row-size=93B cardinality=10
|
|--06:EXCHANGE [BROADCAST]
| |
| 02:SCAN HDFS [functional.alltypes t2]
| HDFS partitions=24/24 files=24 size=478.45KB
| predicates: (t2.bigint_col = 5)
| row-size=89B cardinality=730
|
05:AGGREGATE [FINALIZE]
| group by: int_col
| row-size=4B cardinality=10
|
04:EXCHANGE [HASH(int_col)]
|
01:AGGREGATE [STREAMING]
| group by: int_col
| row-size=4B cardinality=10
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=4B cardinality=7.30K
====
# IMPALA-5689: Do not invert a left semi join with no equi-join predicates.
select * from (
select distinct int_col
from functional.alltypes) t1
left semi join functional.alltypes t2 on (t2.bigint_col=5)
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:EXCHANGE [UNPARTITIONED]
|
03:NESTED LOOP JOIN [LEFT SEMI JOIN, BROADCAST]
| row-size=4B cardinality=10
|
|--07:EXCHANGE [BROADCAST]
| |
| 06:EXCHANGE [UNPARTITIONED]
| | limit: 1
| |
| 02:SCAN HDFS [functional.alltypes t2]
| HDFS partitions=24/24 files=24 size=478.45KB
| predicates: (t2.bigint_col = 5)
| limit: 1
| row-size=8B cardinality=730
|
05:AGGREGATE [FINALIZE]
| group by: int_col
| row-size=4B cardinality=10
|
04:EXCHANGE [HASH(int_col)]
|
01:AGGREGATE [STREAMING]
| group by: int_col
| row-size=4B cardinality=10
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=4B cardinality=7.30K
====