Files
impala/testdata/workloads/functional-query/queries/QueryTest/distinct.test
Tianyi Wang 9a751f00b8 IMPALA-6822: Add a query option to control shuffling by distinct exprs
IMPALA-4794 changed the distinct aggregation behavior to shuffling by
both grouping exprs and the distinct expr. It's slower in queries
where the NDVs of grouping exprs are high and data are uniformly
distributed among groups. This patch adds a query option controlling
this behavior, letting users switch to the old plan.

Change-Id: Icb4b4576fb29edd62cf4b4ba0719c0e0a2a5a8dc
Reviewed-on: http://gerrit.cloudera.org:8080/9949
Reviewed-by: Tianyi Wang <twang@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-04-12 22:01:35 +00:00

403 lines
9.9 KiB
Plaintext

====
---- QUERY
# count(distinct)
select count(distinct smallint_col, tinyint_col)
from alltypesagg
---- RESULTS
90
---- TYPES
bigint
====
---- QUERY
select count(distinct bool_col)
from alltypesagg
---- RESULTS
2
---- TYPES
bigint
====
---- QUERY
select count(distinct int_col)
from alltypesagg
---- RESULTS
999
---- TYPES
bigint
====
---- QUERY
select count(distinct float_col)
from alltypesagg
---- RESULTS
999
---- TYPES
bigint
====
---- QUERY
select count(distinct double_col)
from alltypesagg
---- RESULTS
999
---- TYPES
bigint
====
---- QUERY
select count(distinct string_col)
from alltypesagg
---- RESULTS
1000
---- TYPES
bigint
====
---- QUERY
select count(distinct date_string_col)
from alltypesagg
---- RESULTS
10
---- TYPES
bigint
====
---- QUERY
select count(distinct timestamp_col)
from alltypesagg
---- RESULTS
10000
---- TYPES
bigint
====
---- QUERY
# count(distinct) w/ grouping and non-distinct count()
select tinyint_col, count(distinct smallint_col), count(smallint_col)
from alltypesagg group by 1
---- RESULTS
2,10,1000
3,10,1000
NULL,9,1800
4,10,1000
8,10,1000
5,10,1000
9,10,1000
7,10,1000
1,10,1000
6,10,1000
---- TYPES
tinyint, bigint, bigint
====
---- QUERY
# count(distinct) w/ grouping and non-distinct count()
select tinyint_col, count(distinct int_col, smallint_col), count(smallint_col)
from alltypesagg group by 1
---- RESULTS
2,100,1000
3,100,1000
NULL,90,1800
4,100,1000
8,100,1000
5,100,1000
9,100,1000
7,100,1000
1,100,1000
6,100,1000
---- TYPES
tinyint, bigint, bigint
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping and non-distinct count()
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col)
from alltypesagg group by 1
---- RESULTS
2,10,470,1000
3,10,480,1000
NULL,9,450,1800
4,10,490,1000
8,10,530,1000
5,10,500,1000
9,10,540,1000
7,10,520,1000
1,10,460,1000
6,10,510,1000
---- TYPES
tinyint, bigint, bigint, bigint
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping; distinct in min() and max()
# ignored
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col), min(distinct int_col), max(distinct float_col)
from alltypesagg group by 1
---- RESULTS
2,10,470,1000,2,1091.199951171875
3,10,480,1000,3,1092.300048828125
NULL,9,450,1800,10,1089
4,10,490,1000,4,1093.400024414062
8,10,530,1000,8,1097.800048828125
5,10,500,1000,5,1094.5
9,10,540,1000,9,1098.900024414062
7,10,520,1000,7,1096.699951171875
1,10,460,1000,1,1090.099975585938
6,10,510,1000,6,1095.599975585938
---- TYPES
tinyint, bigint, bigint, bigint, int, float
====
---- QUERY
# count distinct order by the same agg expr
select count(distinct id) as count_id from alltypessmall order by count_id limit 100
---- RESULTS
100
---- TYPES
bigint
====
---- QUERY
# count distinct order by a diff agg expr
select count(distinct id) as sum_id from alltypessmall order by max(distinct id)
limit 100
---- RESULTS
100
---- TYPES
bigint
====
---- QUERY
# distinct *
select distinct *
from alltypesagg
where id < 20
---- RESULTS
6,true,6,6,6,60,6.599999904632568,60.59999999999999,'01/01/10','6',2010-01-01 00:06:00.150000000,2010,1,1
14,true,4,14,14,140,15.39999961853027,141.4,'01/01/10','14',2010-01-01 00:14:00.910000000,2010,1,1
18,true,8,18,18,180,19.79999923706055,181.8,'01/01/10','18',2010-01-01 00:18:01.530000000,2010,1,1
19,false,9,19,19,190,20.89999961853027,191.9,'01/01/10','19',2010-01-01 00:19:01.710000000,2010,1,1
1,false,1,1,1,10,1.100000023841858,10.1,'01/01/10','1',2010-01-01 00:01:00,2010,1,1
2,true,2,2,2,20,2.200000047683716,20.2,'01/01/10','2',2010-01-01 00:02:00.100000000,2010,1,1
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,1
7,false,7,7,7,70,7.699999809265137,70.7,'01/01/10','7',2010-01-01 00:07:00.210000000,2010,1,1
9,false,9,9,9,90,9.899999618530273,90.89999999999999,'01/01/10','9',2010-01-01 00:09:00.360000000,2010,1,1
11,false,1,11,11,110,12.10000038146973,111.1,'01/01/10','11',2010-01-01 00:11:00.550000000,2010,1,1
3,false,3,3,3,30,3.299999952316284,30.3,'01/01/10','3',2010-01-01 00:03:00.300000000,2010,1,1
15,false,5,15,15,150,16.5,151.5,'01/01/10','15',2010-01-01 00:15:01.500000000,2010,1,1
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,1
17,false,7,17,17,170,18.70000076293945,171.7,'01/01/10','17',2010-01-01 00:17:01.360000000,2010,1,1
5,false,5,5,5,50,5.5,50.5,'01/01/10','5',2010-01-01 00:05:00.100000000,2010,1,1
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,NULL
8,true,8,8,8,80,8.800000190734863,80.8,'01/01/10','8',2010-01-01 00:08:00.280000000,2010,1,1
13,false,3,13,13,130,14.30000019073486,131.3,'01/01/10','13',2010-01-01 00:13:00.780000000,2010,1,1
12,true,2,12,12,120,13.19999980926514,121.2,'01/01/10','12',2010-01-01 00:12:00.660000000,2010,1,1
16,true,6,16,16,160,17.60000038146973,161.6,'01/01/10','16',2010-01-01 00:16:01.200000000,2010,1,1
4,true,4,4,4,40,4.400000095367432,40.4,'01/01/10','4',2010-01-01 00:04:00.600000000,2010,1,1
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,NULL
---- TYPES
int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp, int, int, int
====
---- QUERY
# distinct w/ explicit select list
select distinct bool_col, tinyint_col
from alltypesagg
---- RESULTS
false,1
false,3
false,5
false,7
false,9
true,2
true,4
true,6
true,8
true,NULL
---- TYPES
boolean, tinyint
====
---- QUERY
# test count distinct with nulls w/o groupby
select count(distinct NULL) from alltypesagg
---- RESULTS
0
---- TYPES
bigint
====
---- QUERY
# test count distinct with nulls w/o groupby
select count(distinct int_col, NULL) from alltypesagg
---- RESULTS
0
---- TYPES
bigint
====
---- QUERY
# test count distinct with nulls and groupby
select tinyint_col, count(distinct NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
---- TYPES
tinyint, bigint
====
---- QUERY
select tinyint_col, count(distinct int_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
---- TYPES
tinyint, bigint
====
---- QUERY
# make sure we still return the NULL group even though NULLs are not added to COUNT
select tinyint_col, count(distinct tinyint_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
---- TYPES
tinyint, bigint
====
---- QUERY
# test join on inline views containing distinct aggregates to make sure
# the aggregation info reports the correct tuple ids (from the 2nd phase
# distinct aggregation) for the inline-view expression substitution
select t1.c, t2.c from
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
---- RESULTS
2,2
---- TYPES
bigint, bigint
====
---- QUERY
# Test distinct + non-distinct aggregates without group by and having clause (IMPALA-845)
# alltypessmall has 100 non-null values for bigint_col, expect a single result
select count(distinct tinyint_col) from functional.alltypessmall
having count(bigint_col) > 99
---- RESULTS
10
---- TYPES
bigint
====
---- QUERY
# Test distinct + non-distinct aggregates without group by and having clause (IMPALA-845)
# alltypessmall has 100 non-null values for bigint_col, expect an empty result
select count(distinct tinyint_col) from functional.alltypessmall
having count(bigint_col) > 100
---- RESULTS
---- TYPES
bigint
====
---- QUERY
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
select count(*), COUNT(distinct 1) from alltypesagg where false
---- RESULTS
0,0
---- TYPES
bigint, bigint
====
---- QUERY
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
select count(tinyint_col), sum(distinct int_col) from alltypesagg
---- RESULTS
9000,499500
---- TYPES
bigint, bigint
====
---- QUERY
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
select count(*), COUNT(distinct 1) from alltypesagg
---- RESULTS
11000,1
---- TYPES
bigint, bigint
====
---- QUERY
# Test distinct + non-distinct aggregation with intermediate tuples (IMPALA-1247).
select avg(int_col), count(int_col), avg(distinct bigint_col)
from functional.alltypes
---- RESULTS
4.5,7300,45
---- TYPES
double, bigint, double
====
---- QUERY
# Test distinct + non-distinct aggregation with intermediate tuples (IMPALA-1247).
select bool_col, avg(int_col), count(int_col), avg(distinct bigint_col)
from functional.alltypes
group by bool_col
order by bool_col
---- RESULTS
false,5,3650,50
true,4,3650,40
---- TYPES
boolean, double, bigint, double
====
---- QUERY
# Test rewriting count(distinct) into NDV() via a query option.
set appx_count_distinct=true;
select count(distinct int_col), count(distinct float_col), count(distinct string_col)
from alltypesagg
---- RESULTS
957,1016,963
---- TYPES
bigint, bigint, bigint
====
---- QUERY
# Large (more than 1 regular batches) distinct with multiple group by keys.
SELECT COUNT(*) FROM
(SELECT COUNT(DISTINCT p_partkey)
FROM tpch_parquet.part GROUP BY p_brand, p_type, p_size) v
---- RESULTS
123039
---- TYPES
bigint
====
---- QUERY
# Distinct agg without a grouping expr and with a compatible child partition
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.int_col = b.int_col;
---- RESULTS
10
---- TYPES
bigint
====
---- QUERY
# Distinct agg with a grouping expr. The input is partitioned by grouping exprs.
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.year = b.year group by a.year;
---- RESULTS
10
10
---- TYPES
bigint
====
---- QUERY
# Distinct agg with a grouping expr. The input is partitioned by grouping exprs and
# distinct exprs.
select count(distinct a.int_col) from functional.alltypes a inner join [shuffle]
functional.alltypes b on a.year = b.year and a.int_col = b.int_col group by a.year;
---- RESULTS
10
10
---- TYPES
bigint
====