impala/testdata/workloads/functional-query/queries/QueryTest/distinct.test

---- QUERY
# count(distinct)
select count(distinct smallint_col, tinyint_col)
from alltypesagg
---- RESULTS
90
---- TYPES
bigint
====
---- QUERY
select count(distinct bool_col)
from alltypesagg
---- RESULTS
2
---- TYPES
bigint
====
---- QUERY
select count(distinct int_col)
from alltypesagg
---- RESULTS
999
---- TYPES
bigint
====
---- QUERY
select count(distinct float_col)
from alltypesagg
---- RESULTS
999
---- TYPES
bigint
====
---- QUERY
select count(distinct double_col)
from alltypesagg
---- RESULTS
999
---- TYPES
bigint
====
---- QUERY
select count(distinct string_col)
from alltypesagg
---- RESULTS
1000
---- TYPES
bigint
====
---- QUERY
select count(distinct date_string_col)
from alltypesagg
---- RESULTS
10
---- TYPES
bigint
====
---- QUERY
select count(distinct timestamp_col)
from alltypesagg
---- RESULTS
10000
---- TYPES
bigint
====
---- QUERY
# count(distinct) w/ grouping and non-distinct count()
select tinyint_col, count(distinct smallint_col), count(smallint_col)
from alltypesagg group by 1
---- RESULTS
2,10,1000
3,10,1000
NULL,9,1800
4,10,1000
8,10,1000
5,10,1000
9,10,1000
7,10,1000
1,10,1000
6,10,1000
---- TYPES
tinyint, bigint, bigint
====
---- QUERY
# count(distinct) w/ grouping and non-distinct count()
select tinyint_col, count(distinct int_col, smallint_col), count(smallint_col)
from alltypesagg group by 1
---- RESULTS
2,100,1000
3,100,1000
NULL,90,1800
4,100,1000
8,100,1000
5,100,1000
9,100,1000
7,100,1000
1,100,1000
6,100,1000
---- TYPES
tinyint, bigint, bigint
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping and non-distinct count()
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col)
from alltypesagg group by 1
---- RESULTS
2,10,470,1000
3,10,480,1000
NULL,9,450,1800
4,10,490,1000
8,10,530,1000
5,10,500,1000
9,10,540,1000
7,10,520,1000
1,10,460,1000
6,10,510,1000
---- TYPES
tinyint, bigint, bigint, bigint
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping; distinct in min() and max()
# ignored
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col), min(distinct int_col), max(distinct float_col)
from alltypesagg group by 1
---- RESULTS
2,10,470,1000,2,1091.199951171875
3,10,480,1000,3,1092.300048828125
NULL,9,450,1800,10,1089
4,10,490,1000,4,1093.400024414062
8,10,530,1000,8,1097.800048828125
5,10,500,1000,5,1094.5
9,10,540,1000,9,1098.900024414062
7,10,520,1000,7,1096.699951171875
1,10,460,1000,1,1090.099975585938
6,10,510,1000,6,1095.599975585938
---- TYPES
tinyint, bigint, bigint, bigint, int, float
====
---- QUERY
# count distinct order by the same agg expr
select count(distinct id) as count_id from alltypessmall order by count_id limit 100
---- RESULTS
100
---- TYPES
bigint
====
---- QUERY
# count distinct order by a diff agg expr
select count(distinct id) as sum_id from alltypessmall order by max(distinct id)
limit 100
---- RESULTS
100
---- TYPES
bigint
====
---- QUERY
# distinct *
select distinct *
from alltypesagg
where id < 20
---- RESULTS
6,true,6,6,6,60,6.599999904632568,60.59999999999999,'01/01/10','6',2010-01-01 00:06:00.150000000,2010,1,1
14,true,4,14,14,140,15.39999961853027,141.4,'01/01/10','14',2010-01-01 00:14:00.910000000,2010,1,1
18,true,8,18,18,180,19.79999923706055,181.8,'01/01/10','18',2010-01-01 00:18:01.530000000,2010,1,1
19,false,9,19,19,190,20.89999961853027,191.9,'01/01/10','19',2010-01-01 00:19:01.710000000,2010,1,1
1,false,1,1,1,10,1.100000023841858,10.1,'01/01/10','1',2010-01-01 00:01:00,2010,1,1
2,true,2,2,2,20,2.200000047683716,20.2,'01/01/10','2',2010-01-01 00:02:00.100000000,2010,1,1
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,1
7,false,7,7,7,70,7.699999809265137,70.7,'01/01/10','7',2010-01-01 00:07:00.210000000,2010,1,1
9,false,9,9,9,90,9.899999618530273,90.89999999999999,'01/01/10','9',2010-01-01 00:09:00.360000000,2010,1,1
11,false,1,11,11,110,12.10000038146973,111.1,'01/01/10','11',2010-01-01 00:11:00.550000000,2010,1,1
3,false,3,3,3,30,3.299999952316284,30.3,'01/01/10','3',2010-01-01 00:03:00.300000000,2010,1,1
15,false,5,15,15,150,16.5,151.5,'01/01/10','15',2010-01-01 00:15:01.500000000,2010,1,1
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,1
17,false,7,17,17,170,18.70000076293945,171.7,'01/01/10','17',2010-01-01 00:17:01.360000000,2010,1,1
5,false,5,5,5,50,5.5,50.5,'01/01/10','5',2010-01-01 00:05:00.100000000,2010,1,1
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,NULL
8,true,8,8,8,80,8.800000190734863,80.8,'01/01/10','8',2010-01-01 00:08:00.280000000,2010,1,1
13,false,3,13,13,130,14.30000019073486,131.3,'01/01/10','13',2010-01-01 00:13:00.780000000,2010,1,1
12,true,2,12,12,120,13.19999980926514,121.2,'01/01/10','12',2010-01-01 00:12:00.660000000,2010,1,1
16,true,6,16,16,160,17.60000038146973,161.6,'01/01/10','16',2010-01-01 00:16:01.200000000,2010,1,1
4,true,4,4,4,40,4.400000095367432,40.4,'01/01/10','4',2010-01-01 00:04:00.600000000,2010,1,1
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,NULL
---- TYPES
int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp, int, int, int
====
---- QUERY
# distinct w/ explicit select list
select distinct bool_col, tinyint_col
from alltypesagg
---- RESULTS
false,1
false,3
false,5
false,7
false,9
true,2
true,4
true,6
true,8
true,NULL
---- TYPES
boolean, tinyint
====
---- QUERY
# test count distinct with nulls w/o groupby
select count(distinct NULL) from alltypesagg
---- RESULTS
0
---- TYPES
bigint
====
---- QUERY
# test count distinct with nulls w/o groupby
select count(distinct int_col, NULL) from alltypesagg
---- RESULTS
0
---- TYPES
bigint
====
---- QUERY
# test count distinct with nulls and groupby
select tinyint_col, count(distinct NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
---- TYPES
tinyint, bigint
====
---- QUERY
select tinyint_col, count(distinct int_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
---- TYPES
tinyint, bigint
====
---- QUERY
# make sure we still return the NULL group even though NULLs are not added to COUNT
select tinyint_col, count(distinct tinyint_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
---- TYPES
tinyint, bigint
====
---- QUERY
# test join on inline views containing distinct aggregates to make sure
# the aggregation info reports the correct tuple ids (from the 2nd phase
# distinct aggregation) for the inline-view expression substitution
select t1.c, t2.c from
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
---- RESULTS
2,2
---- TYPES
bigint, bigint
====
---- QUERY
# Test distinct + non-distinct aggregates without group by and having clause (IMPALA-845)
# alltypessmall has 100 non-null values for bigint_col, expect a single result
select count(distinct tinyint_col) from functional.alltypessmall
having count(bigint_col) > 99
---- RESULTS
10
---- TYPES
bigint
====
---- QUERY
# Test distinct + non-distinct aggregates without group by and having clause (IMPALA-845)
# alltypessmall has 100 non-null values for bigint_col, expect an empty result
select count(distinct tinyint_col) from functional.alltypessmall
having count(bigint_col) > 100
---- RESULTS
---- TYPES
bigint
====
---- QUERY
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
select count(*), COUNT(distinct 1) from alltypesagg where false
---- RESULTS
0,0
---- TYPES
bigint, bigint
====
---- QUERY
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
select count(tinyint_col), sum(distinct int_col) from alltypesagg
---- RESULTS
9000,499500
---- TYPES
bigint, bigint
====
---- QUERY
# Regression test for COUNT(ALL ) with no group-by and other distinct agg. IMPALA-862
select count(*), COUNT(distinct 1) from alltypesagg
---- RESULTS
11000,1
---- TYPES
bigint, bigint
====
---- QUERY
# Test distinct + non-distinct aggregation with intermediate tuples (IMPALA-1247).
select avg(int_col), count(int_col), avg(distinct bigint_col)
from functional.alltypes
---- RESULTS
4.5,7300,45
---- TYPES
double, bigint, double
====
---- QUERY
# Test distinct + non-distinct aggregation with intermediate tuples (IMPALA-1247).
select bool_col, avg(int_col), count(int_col), avg(distinct bigint_col)
from functional.alltypes
group by bool_col
order by bool_col
---- RESULTS
false,5,3650,50
true,4,3650,40
---- TYPES
boolean, double, bigint, double
====
---- QUERY
# Test rewriting count(distinct) into NDV() via a query option.
set appx_count_distinct=true;
select count(distinct int_col), count(distinct float_col), count(distinct string_col)
from alltypesagg
---- RESULTS
1881,944,1178
---- TYPES
bigint, bigint, bigint
====
---- QUERY
# Unset the query option from the previous test.
set appx_count_distinct=false
---- RESULTS
---- TYPES
string, string
====
---- QUERY
# Large (more than 1 regular batches) distinct with multiple group by keys.
SELECT COUNT(*) FROM
(SELECT COUNT(DISTINCT p_partkey)
 FROM tpch_parquet.part GROUP BY p_brand, p_type, p_size) v
---- RESULTS
123039
---- TYPES
bigint
====