Files
impala/testdata/workloads/functional-query/queries/QueryTest/distinct.test
Srinath Shankar 74a975c45b IMPALA-862: count(x) may return null when a similar count(distinct x) is also used
count(x) with no distinct and no group-by expressions returns NULL on empty input
if other distinct aggs (e.g. COUNT(distinct x)) are present.
This happens because the COUNT is transformed to SUM(COUNT()),
with the inner COUNT being evaluated WITH a group-by expression (e.g. x).
SUM over empty input returns NULL, but COUNT should return 0.

This patch fixes this by replacing COUNT with zeroifnull(COUNT) before AggregateInfo
is generated if there are distinct aggs and no group-bys. The logic in AggregateInfo
itself has not been modified.

Change-Id: I902e3fdd95767135b2f3fe423e8802ef57366af1
Reviewed-on: http://gerrit.ent.cloudera.com:8080/1921
Reviewed-by: Srinath Shankar <sshankar@cloudera.com>
Tested-by: jenkins
2014-03-14 23:35:55 -07:00

330 lines
7.9 KiB
Plaintext

====
---- QUERY
# count(distinct) over two columns (smallint_col, tinyint_col), no grouping.
# A single bigint row is expected.
select count(distinct smallint_col, tinyint_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
90
====
---- QUERY
# count(distinct) on a boolean column, no grouping; expects 2.
select count(distinct bool_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
2
====
---- QUERY
# count(distinct) on an int column, no grouping; expects 999.
select count(distinct int_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
999
====
---- QUERY
# count(distinct) on a float column, no grouping; expects 999.
select count(distinct float_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
999
====
---- QUERY
# count(distinct) on a double column, no grouping; expects 999.
select count(distinct double_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
999
====
---- QUERY
# count(distinct) on a string column, no grouping; expects 1000.
select count(distinct string_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
1000
====
---- QUERY
# count(distinct) on date_string_col, no grouping; expects 10.
select count(distinct date_string_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
10
====
---- QUERY
# count(distinct) on a timestamp column, no grouping; expects 10000.
select count(distinct timestamp_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
10000
====
---- QUERY
# Single-column count(distinct) together with a non-distinct count() of the same
# column, grouped by tinyint_col (select-list position 1).
# The expected output includes a row for the NULL tinyint_col group.
select tinyint_col, count(distinct smallint_col), count(smallint_col)
from alltypesagg group by 1
---- TYPES
tinyint, bigint, bigint
---- RESULTS
1,10,1000
2,10,1000
3,10,1000
4,10,1000
5,10,1000
6,10,1000
7,10,1000
8,10,1000
9,10,1000
NULL,9,900
====
---- QUERY
# Multi-column count(distinct int_col, smallint_col) together with a non-distinct
# count(), grouped by tinyint_col (select-list position 1).
# The expected output includes a row for the NULL tinyint_col group.
select tinyint_col, count(distinct int_col, smallint_col), count(smallint_col)
from alltypesagg group by 1
---- TYPES
tinyint, bigint, bigint
---- RESULTS
1,100,1000
2,100,1000
3,100,1000
4,100,1000
5,100,1000
6,100,1000
7,100,1000
8,100,1000
9,100,1000
NULL,90,900
====
---- QUERY
# count(distinct) and sum(distinct) over the same column (smallint_col), plus a
# non-distinct count() of that column, grouped by tinyint_col (position 1).
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col)
from alltypesagg group by 1
---- TYPES
tinyint, bigint, bigint, bigint
---- RESULTS
1,10,460,1000
2,10,470,1000
3,10,480,1000
4,10,490,1000
5,10,500,1000
6,10,510,1000
7,10,520,1000
8,10,530,1000
9,10,540,1000
NULL,9,450,900
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping, combined with min(distinct) and
# max(distinct) on other columns (int_col, float_col). The DISTINCT keyword inside
# min() and max() is ignored, so these may name columns other than smallint_col.
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col), min(distinct int_col), max(distinct float_col)
from alltypesagg group by 1
---- TYPES
tinyint, bigint, bigint, bigint, int, float
---- RESULTS
1,10,460,1000,1,1090.099975585938
2,10,470,1000,2,1091.199951171875
3,10,480,1000,3,1092.300048828125
4,10,490,1000,4,1093.400024414062
5,10,500,1000,5,1094.5
6,10,510,1000,6,1095.599975585938
7,10,520,1000,7,1096.699951171875
8,10,530,1000,8,1097.800048828125
9,10,540,1000,9,1098.900024414062
NULL,9,450,900,10,1089
====
---- QUERY
# count(distinct) where ORDER BY references the same aggregate through its
# select-list alias (count_id). Only one row is produced, so the limit of 100
# does not truncate the result.
select count(distinct id) as count_id from alltypessmall order by count_id limit 100
---- TYPES
bigint
---- RESULTS
100
====
---- QUERY
# count(distinct) where ORDER BY uses a different aggregate expression
# (max(distinct id)) that does not appear in the select list.
# The alias is named after the count it labels; it is not referenced elsewhere
# in the query.
select count(distinct id) as count_id from alltypessmall order by max(distinct id)
limit 100
---- TYPES
bigint
---- RESULTS
100
====
---- QUERY
# select distinct * over all 14 columns of alltypesagg, restricted to id < 20.
# 20 rows are expected. They are listed below in lexicographic (string) order of
# the row text, e.g. id 10 follows id 1; the query itself has no ORDER BY.
select distinct *
from alltypesagg
where id < 20
---- TYPES
int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp, int, int, int
---- RESULTS
0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00,2010,1,1
1,false,1,1,1,10,1.100000023841858,10.1,'01/01/10','1',2010-01-01 00:01:00,2010,1,1
10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000,2010,1,1
11,false,1,11,11,110,12.10000038146973,111.1,'01/01/10','11',2010-01-01 00:11:00.550000000,2010,1,1
12,true,2,12,12,120,13.19999980926514,121.2,'01/01/10','12',2010-01-01 00:12:00.660000000,2010,1,1
13,false,3,13,13,130,14.30000019073486,131.3,'01/01/10','13',2010-01-01 00:13:00.780000000,2010,1,1
14,true,4,14,14,140,15.39999961853027,141.4,'01/01/10','14',2010-01-01 00:14:00.910000000,2010,1,1
15,false,5,15,15,150,16.5,151.5,'01/01/10','15',2010-01-01 00:15:01.500000000,2010,1,1
16,true,6,16,16,160,17.60000038146973,161.6,'01/01/10','16',2010-01-01 00:16:01.200000000,2010,1,1
17,false,7,17,17,170,18.70000076293945,171.7,'01/01/10','17',2010-01-01 00:17:01.360000000,2010,1,1
18,true,8,18,18,180,19.79999923706055,181.8,'01/01/10','18',2010-01-01 00:18:01.530000000,2010,1,1
19,false,9,19,19,190,20.89999961853027,191.9,'01/01/10','19',2010-01-01 00:19:01.710000000,2010,1,1
2,true,2,2,2,20,2.200000047683716,20.2,'01/01/10','2',2010-01-01 00:02:00.100000000,2010,1,1
3,false,3,3,3,30,3.299999952316284,30.3,'01/01/10','3',2010-01-01 00:03:00.300000000,2010,1,1
4,true,4,4,4,40,4.400000095367432,40.4,'01/01/10','4',2010-01-01 00:04:00.600000000,2010,1,1
5,false,5,5,5,50,5.5,50.5,'01/01/10','5',2010-01-01 00:05:00.100000000,2010,1,1
6,true,6,6,6,60,6.599999904632568,60.6,'01/01/10','6',2010-01-01 00:06:00.150000000,2010,1,1
7,false,7,7,7,70,7.699999809265137,70.7,'01/01/10','7',2010-01-01 00:07:00.210000000,2010,1,1
8,true,8,8,8,80,8.800000190734863,80.8,'01/01/10','8',2010-01-01 00:08:00.280000000,2010,1,1
9,false,9,9,9,90,9.899999618530273,90.90000000000001,'01/01/10','9',2010-01-01 00:09:00.360000000,2010,1,1
====
---- QUERY
# select distinct w/ an explicit two-column select list (bool_col, tinyint_col).
# The expected output includes a combination whose tinyint_col is NULL (true,NULL).
select distinct bool_col, tinyint_col
from alltypesagg
---- TYPES
boolean, tinyint
---- RESULTS
false,1
false,3
false,5
false,7
false,9
true,2
true,4
true,6
true,8
true,NULL
====
---- QUERY
# count(distinct) of a NULL literal w/o group by: expects 0 (not NULL).
select count(distinct NULL) from alltypesagg
---- TYPES
bigint
---- RESULTS
0
====
---- QUERY
# Multi-argument count(distinct) w/o group by where one argument is a NULL
# literal: expects 0 even though int_col itself has non-NULL values.
select count(distinct int_col, NULL) from alltypesagg
---- TYPES
bigint
---- RESULTS
0
====
---- QUERY
# count(distinct) of a NULL literal with group by: every tinyint_col group,
# including the NULL group, is expected with a count of 0.
# The explicit order by lists the NULL group last in the expected output.
select tinyint_col, count(distinct NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- TYPES
tinyint, bigint
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
====
---- QUERY
# Multi-argument count(distinct int_col, NULL) with group by: every tinyint_col
# group, including the NULL group, is expected with a count of 0.
select tinyint_col, count(distinct int_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- TYPES
tinyint, bigint
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
====
---- QUERY
# The grouping column (tinyint_col) is also an argument of the count(distinct).
# Make sure the NULL group is still returned even though NULLs are not added to
# COUNT; every group is expected with a count of 0 because of the NULL literal.
select tinyint_col, count(distinct tinyint_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- TYPES
tinyint, bigint
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
====
---- QUERY
# Test a join on inline views that each contain a distinct aggregate, to make
# sure the aggregation info reports the correct tuple ids (from the 2nd phase
# distinct aggregation) for the inline-view expression substitution.
# Both views yield a single count; the join on t1.c = t2.c expects the row 2,2.
select t1.c, t2.c from
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
---- TYPES
bigint, bigint
---- RESULTS
2,2
====
---- QUERY
# Test a distinct aggregate in the select list combined with a non-distinct
# aggregate in a HAVING clause, with no GROUP BY (IMPALA-845).
# alltypessmall has 100 non-null values for bigint_col, so the predicate
# count(bigint_col) > 99 holds; expect a single result row.
select count(distinct tinyint_col) from functional.alltypessmall
having count(bigint_col) > 99
---- TYPES
bigint
---- RESULTS
10
====
---- QUERY
# Test a distinct aggregate in the select list combined with a non-distinct
# aggregate in a HAVING clause, with no GROUP BY (IMPALA-845).
# alltypessmall has 100 non-null values for bigint_col, so the predicate
# count(bigint_col) > 100 fails; expect an empty result.
select count(distinct tinyint_col) from functional.alltypessmall
having count(bigint_col) > 100
---- TYPES
bigint
---- RESULTS
====
---- QUERY
# Regression test for IMPALA-862: a non-distinct count() with no group-by, used
# alongside a distinct aggregate. The input is empty ("where false"); the
# non-distinct count must come back as 0 rather than NULL.
select count(*), COUNT(distinct 1) from alltypesagg where false
---- RESULTS
0,0
---- TYPES
bigint, bigint
====
---- QUERY
# Regression test for IMPALA-862: a non-distinct count(col) with no group-by,
# used alongside sum(distinct) on a different column, over non-empty input.
select count(tinyint_col), sum(distinct int_col) from alltypesagg
---- RESULTS
9000,499500
---- TYPES
bigint, bigint
====
---- QUERY
# Regression test for IMPALA-862: count(*) with no group-by, used alongside
# COUNT(distinct <constant>), over non-empty input. The distinct count of the
# constant 1 is expected to be 1.
select count(*), COUNT(distinct 1) from alltypesagg
---- RESULTS
10000,1
---- TYPES
bigint, bigint
====