mirror of
https://github.com/apache/impala.git
synced 2026-01-18 15:00:35 -05:00
IMPALA-1270: add distinct aggregation to semi joins
When generating plans with left semi/anti joins (typically resulting from subquery rewrites), the planner now considers inserting a distinct aggregation on the inner side of the join. The decision is based on whether that aggregation would reduce the number of rows by more than 75%. This is fairly conservative and the optimization might be beneficial for smaller reductions, but the conservative threshold is chosen to reduce the number of potential plan regressions.

The aggregation can reduce both the number of rows and the width of the rows, by projecting out unneeded slots.

The ENABLE_DISTINCT_SEMI_JOIN_OPTIMIZATION query option is added to allow toggling the optimization.

Tests:
* Add positive and negative planner tests for various cases - including semi/anti joins, missing stats, broadcast/shuffle, different numbers of join predicates.
* Add some end-to-end tests to verify plans execute correctly.

Change-Id: Icbb955e805d9e764edf11c57b98f341b88a37fcc
Reviewed-on: http://gerrit.cloudera.org:8080/16180
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
0b5a9889ee
commit
63f5e8ec00
@@ -1434,3 +1434,88 @@ from functional.alltypestiny
|
||||
---- TYPES
|
||||
INT, INT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-1270: test that distinct subquery is executable and returns correct results.
# The uncorrelated IN subquery is presumably rewritten into a left semi join, which is
# where the planner may insert a distinct aggregation on the inner side.
# NOTE(review): whether the aggregation is actually inserted for this query depends on
# table stats and on ENABLE_DISTINCT_SEMI_JOIN_OPTIMIZATION - confirm against the plan.
|
||||
select id from alltypestiny
|
||||
where int_col in (select int_col from alltypes where id % 2 = 0)
|
||||
---- RESULTS
|
||||
0
|
||||
2
|
||||
4
|
||||
6
|
||||
---- TYPES
|
||||
INT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-1270: test that distinct subquery with anti join is executable and
|
||||
# returns correct results.
# Uses the same subquery as the preceding IN test but negated with NOT IN, so the
# expected ids are exactly the ones the IN test does not return.
|
||||
select id from alltypestiny
|
||||
where int_col not in (select int_col from alltypes where id % 2 = 0)
|
||||
---- RESULTS
|
||||
1
|
||||
3
|
||||
5
|
||||
7
|
||||
---- TYPES
|
||||
INT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-1270: test that subquery with no join predicates is executable and
|
||||
# returns correct results. A limit is added by the planner.
# The EXISTS subquery is uncorrelated: nothing relates its rows to the outer rows, so
# every id of alltypestiny is expected as long as the subquery returns at least one row.
|
||||
select id from alltypestiny
|
||||
where exists (select int_col from alltypes where id % 2 = 0)
|
||||
---- RESULTS
|
||||
0
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
6
|
||||
7
|
||||
---- TYPES
|
||||
INT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-1270: test subquery with multiple join predicates with distinct
|
||||
# added returns correct results.
# The two join predicates are the IN comparison on int_col and the correlated
# equality t1.bool_col = t2.bool_col; "id is not null" only filters the inner table.
|
||||
select count(*) from alltypesagg t1
|
||||
where int_col in (
|
||||
select int_col from alltypes t2
|
||||
where t1.bool_col = t2.bool_col and id is not null);
|
||||
---- RESULTS
|
||||
90
|
||||
---- TYPES
|
||||
BIGINT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-1270: test subquery with aggregate function returns correct results.
# The subquery returns count(*) per (int_col, tinyint_col) group, so t1.int_col is
# compared against aggregate results rather than against a base-table column.
# NOTE(review): every expected id appears twice, presumably because alltypesagg holds
# duplicate ids for these rows - confirm against the fixture data.
|
||||
select id from alltypesagg t1
|
||||
where int_col in (
|
||||
select count(*)
|
||||
from alltypes t2
|
||||
group by int_col, tinyint_col)
|
||||
---- RESULTS
|
||||
730
|
||||
730
|
||||
1730
|
||||
1730
|
||||
2730
|
||||
2730
|
||||
3730
|
||||
3730
|
||||
4730
|
||||
4730
|
||||
5730
|
||||
5730
|
||||
6730
|
||||
6730
|
||||
7730
|
||||
7730
|
||||
8730
|
||||
8730
|
||||
9730
|
||||
9730
|
||||
---- TYPES
|
||||
INT
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user