mirror of
https://github.com/apache/impala.git
synced 2026-01-07 18:02:33 -05:00
IMPALA-4794 changed the distinct aggregation behavior to shuffling by both grouping exprs and the distinct expr. It's slower in queries where the NDVs of grouping exprs are high and data are uniformly distributed among groups. This patch adds a query option controlling this behavior, letting users switch to the old plan. Change-Id: Icb4b4576fb29edd62cf4b4ba0719c0e0a2a5a8dc Reviewed-on: http://gerrit.cloudera.org:8080/9949 Reviewed-by: Tianyi Wang <twang@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
403 lines
9.9 KiB
Plaintext
403 lines
9.9 KiB
Plaintext
====
|
|
---- QUERY
|
|
# count(distinct)
|
|
select count(distinct smallint_col, tinyint_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
90
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct bool_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
2
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct int_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
999
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct float_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
999
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct double_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
999
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct string_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
1000
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct date_string_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
10
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
select count(distinct timestamp_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
10000
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# count(distinct) w/ grouping and non-distinct count()
|
|
select tinyint_col, count(distinct smallint_col), count(smallint_col)
|
|
from alltypesagg group by 1
|
|
---- RESULTS
|
|
2,10,1000
|
|
3,10,1000
|
|
NULL,9,1800
|
|
4,10,1000
|
|
8,10,1000
|
|
5,10,1000
|
|
9,10,1000
|
|
7,10,1000
|
|
1,10,1000
|
|
6,10,1000
|
|
---- TYPES
|
|
tinyint, bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# count(distinct) w/ grouping and non-distinct count()
|
|
select tinyint_col, count(distinct int_col, smallint_col), count(smallint_col)
|
|
from alltypesagg group by 1
|
|
---- RESULTS
|
|
2,100,1000
|
|
3,100,1000
|
|
NULL,90,1800
|
|
4,100,1000
|
|
8,100,1000
|
|
5,100,1000
|
|
9,100,1000
|
|
7,100,1000
|
|
1,100,1000
|
|
6,100,1000
|
|
---- TYPES
|
|
tinyint, bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# count(distinct) and sum(distinct) w/ grouping and non-distinct count()
|
|
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
|
|
count(smallint_col)
|
|
from alltypesagg group by 1
|
|
---- RESULTS
|
|
2,10,470,1000
|
|
3,10,480,1000
|
|
NULL,9,450,1800
|
|
4,10,490,1000
|
|
8,10,530,1000
|
|
5,10,500,1000
|
|
9,10,540,1000
|
|
7,10,520,1000
|
|
1,10,460,1000
|
|
6,10,510,1000
|
|
---- TYPES
|
|
tinyint, bigint, bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# count(distinct) and sum(distinct) w/ grouping; distinct in min() and max()
|
|
# ignored
|
|
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
|
|
count(smallint_col), min(distinct int_col), max(distinct float_col)
|
|
from alltypesagg group by 1
|
|
---- RESULTS
|
|
2,10,470,1000,2,1091.199951171875
|
|
3,10,480,1000,3,1092.300048828125
|
|
NULL,9,450,1800,10,1089
|
|
4,10,490,1000,4,1093.400024414062
|
|
8,10,530,1000,8,1097.800048828125
|
|
5,10,500,1000,5,1094.5
|
|
9,10,540,1000,9,1098.900024414062
|
|
7,10,520,1000,7,1096.699951171875
|
|
1,10,460,1000,1,1090.099975585938
|
|
6,10,510,1000,6,1095.599975585938
|
|
---- TYPES
|
|
tinyint, bigint, bigint, bigint, int, float
|
|
====
|
|
---- QUERY
|
|
# count distinct order by the same agg expr
|
|
select count(distinct id) as count_id from alltypessmall order by count_id limit 100
|
|
---- RESULTS
|
|
100
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# count distinct order by a diff agg expr
|
|
select count(distinct id) as sum_id from alltypessmall order by max(distinct id)
|
|
limit 100
|
|
---- RESULTS
|
|
100
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# distinct *
|
|
select distinct *
|
|
from alltypesagg
|
|
where id < 20
|
|
---- RESULTS
|
|
6,true,6,6,6,60,6.599999904632568,60.59999999999999,'01/01/10','6',2010-01-01 00:06:00.150000000,2010,1,1
|
|
14,true,4,14,14,140,15.39999961853027,141.4,'01/01/10','14',2010-01-01 00:14:00.910000000,2010,1,1
|
|
18,true,8,18,18,180,19.79999923706055,181.8,'01/01/10','18',2010-01-01 00:18:01.530000000,2010,1,1
|
|
19,false,9,19,19,190,20.89999961853027,191.9,'01/01/10','19',2010-01-01 00:19:01.710000000,2010,1,1
|
|
1,false,1,1,1,10,1.100000023841858,10.1,'01/01/10','1',2010-01-01 00:01:00,2010,1,1
|
|
2,true,2,2,2,20,2.200000047683716,20.2,'01/01/10','2',2010-01-01 00:02:00.100000000,2010,1,1
|
|
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,1
|
|
7,false,7,7,7,70,7.699999809265137,70.7,'01/01/10','7',2010-01-01 00:07:00.210000000,2010,1,1
|
|
9,false,9,9,9,90,9.899999618530273,90.89999999999999,'01/01/10','9',2010-01-01 00:09:00.360000000,2010,1,1
|
|
11,false,1,11,11,110,12.10000038146973,111.1,'01/01/10','11',2010-01-01 00:11:00.550000000,2010,1,1
|
|
3,false,3,3,3,30,3.299999952316284,30.3,'01/01/10','3',2010-01-01 00:03:00.300000000,2010,1,1
|
|
15,false,5,15,15,150,16.5,151.5,'01/01/10','15',2010-01-01 00:15:01.500000000,2010,1,1
|
|
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,1
|
|
17,false,7,17,17,170,18.70000076293945,171.7,'01/01/10','17',2010-01-01 00:17:01.360000000,2010,1,1
|
|
5,false,5,5,5,50,5.5,50.5,'01/01/10','5',2010-01-01 00:05:00.100000000,2010,1,1
|
|
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,NULL
|
|
8,true,8,8,8,80,8.800000190734863,80.8,'01/01/10','8',2010-01-01 00:08:00.280000000,2010,1,1
|
|
13,false,3,13,13,130,14.30000019073486,131.3,'01/01/10','13',2010-01-01 00:13:00.780000000,2010,1,1
|
|
12,true,2,12,12,120,13.19999980926514,121.2,'01/01/10','12',2010-01-01 00:12:00.660000000,2010,1,1
|
|
16,true,6,16,16,160,17.60000038146973,161.6,'01/01/10','16',2010-01-01 00:16:01.200000000,2010,1,1
|
|
4,true,4,4,4,40,4.400000095367432,40.4,'01/01/10','4',2010-01-01 00:04:00.600000000,2010,1,1
|
|
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,NULL
|
|
---- TYPES
|
|
int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp, int, int, int
|
|
====
|
|
---- QUERY
|
|
# distinct w/ explicit select list
|
|
select distinct bool_col, tinyint_col
|
|
from alltypesagg
|
|
---- RESULTS
|
|
false,1
|
|
false,3
|
|
false,5
|
|
false,7
|
|
false,9
|
|
true,2
|
|
true,4
|
|
true,6
|
|
true,8
|
|
true,NULL
|
|
---- TYPES
|
|
boolean, tinyint
|
|
====
|
|
---- QUERY
|
|
# test count distinct with nulls w/o groupby
|
|
select count(distinct NULL) from alltypesagg
|
|
---- RESULTS
|
|
0
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# test count distinct with nulls w/o groupby
|
|
select count(distinct int_col, NULL) from alltypesagg
|
|
---- RESULTS
|
|
0
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# test count distinct with nulls and groupby
|
|
select tinyint_col, count(distinct NULL) from alltypesagg group by tinyint_col
|
|
order by tinyint_col limit 10
|
|
---- RESULTS
|
|
1,0
|
|
2,0
|
|
3,0
|
|
4,0
|
|
5,0
|
|
6,0
|
|
7,0
|
|
8,0
|
|
9,0
|
|
NULL,0
|
|
---- TYPES
|
|
tinyint, bigint
|
|
====
|
|
---- QUERY
|
|
select tinyint_col, count(distinct int_col, NULL) from alltypesagg group by tinyint_col
|
|
order by tinyint_col limit 10
|
|
---- RESULTS
|
|
1,0
|
|
2,0
|
|
3,0
|
|
4,0
|
|
5,0
|
|
6,0
|
|
7,0
|
|
8,0
|
|
9,0
|
|
NULL,0
|
|
---- TYPES
|
|
tinyint, bigint
|
|
====
|
|
---- QUERY
|
|
# make sure we still return the NULL group even though NULLs are not added to COUNT
|
|
select tinyint_col, count(distinct tinyint_col, NULL) from alltypesagg group by tinyint_col
|
|
order by tinyint_col limit 10
|
|
---- RESULTS
|
|
1,0
|
|
2,0
|
|
3,0
|
|
4,0
|
|
5,0
|
|
6,0
|
|
7,0
|
|
8,0
|
|
9,0
|
|
NULL,0
|
|
---- TYPES
|
|
tinyint, bigint
|
|
====
|
|
---- QUERY
|
|
# test join on inline views containing distinct aggregates to make sure
|
|
# the aggregation info reports the correct tuple ids (from the 2nd phase
|
|
# distinct aggregation) for the inline-view expression substitution
|
|
select t1.c, t2.c from
|
|
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
|
|
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
|
|
---- RESULTS
|
|
2,2
|
|
---- TYPES
|
|
bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# Test distinct + non-distinct aggregates without group by and having clause (IMPALA-845)
|
|
# alltypessmall has 100 non-null values for bigint_col, expect a single result
|
|
select count(distinct tinyint_col) from functional.alltypessmall
|
|
having count(bigint_col) > 99
|
|
---- RESULTS
|
|
10
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# Test distinct + non-distinct aggregates without group by and having clause (IMPALA-845)
|
|
# alltypessmall has 100 non-null values for bigint_col, expect an empty result
|
|
select count(distinct tinyint_col) from functional.alltypessmall
|
|
having count(bigint_col) > 100
|
|
---- RESULTS
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
|
|
select count(*), COUNT(distinct 1) from alltypesagg where false
|
|
---- RESULTS
|
|
0,0
|
|
---- TYPES
|
|
bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
|
|
select count(tinyint_col), sum(distinct int_col) from alltypesagg
|
|
---- RESULTS
|
|
9000,499500
|
|
---- TYPES
|
|
bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
|
|
select count(*), COUNT(distinct 1) from alltypesagg
|
|
---- RESULTS
|
|
11000,1
|
|
---- TYPES
|
|
bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# Test distinct + non-distinct aggregation with intermediate tuples (IMPALA-1247).
|
|
select avg(int_col), count(int_col), avg(distinct bigint_col)
|
|
from functional.alltypes
|
|
---- RESULTS
|
|
4.5,7300,45
|
|
---- TYPES
|
|
double, bigint, double
|
|
====
|
|
---- QUERY
|
|
# Test distinct + non-distinct aggregation with intermediate tuples (IMPALA-1247).
|
|
select bool_col, avg(int_col), count(int_col), avg(distinct bigint_col)
|
|
from functional.alltypes
|
|
group by bool_col
|
|
order by bool_col
|
|
---- RESULTS
|
|
false,5,3650,50
|
|
true,4,3650,40
|
|
---- TYPES
|
|
boolean, double, bigint, double
|
|
====
|
|
---- QUERY
|
|
# Test rewriting count(distinct) into NDV() via a query option.
|
|
set appx_count_distinct=true;
|
|
select count(distinct int_col), count(distinct float_col), count(distinct string_col)
|
|
from alltypesagg
|
|
---- RESULTS
|
|
957,1016,963
|
|
---- TYPES
|
|
bigint, bigint, bigint
|
|
====
|
|
---- QUERY
|
|
# Large (more than 1 regular batches) distinct with multiple group by keys.
|
|
SELECT COUNT(*) FROM
|
|
(SELECT COUNT(DISTINCT p_partkey)
|
|
FROM tpch_parquet.part GROUP BY p_brand, p_type, p_size) v
|
|
---- RESULTS
|
|
123039
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# Distinct agg without a grouping expr and with a compatible child partition
|
|
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
|
|
functional.alltypes b on a.int_col = b.int_col;
|
|
---- RESULTS
|
|
10
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# Distinct agg with a grouping expr. The input is partitioned by grouping exprs.
|
|
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
|
|
functional.alltypes b on a.year = b.year group by a.year;
|
|
---- RESULTS
|
|
10
|
|
10
|
|
---- TYPES
|
|
bigint
|
|
====
|
|
---- QUERY
|
|
# Distinct agg with a grouping expr. The input is partitioned by grouping exprs and
|
|
# distinct exprs.
|
|
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
|
|
functional.alltypes b on a.year = b.year and a.int_col = b.int_col group by a.year;
|
|
---- RESULTS
|
|
10
|
|
10
|
|
---- TYPES
|
|
bigint
|
|
====
|