Files
impala/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
Alan Choi 88101bc90e This patch implements the probabilistic counting algorithm as the aggregate
functions "distinctpc" and "distinctpcsa".

We've gathered statistics on an internal dataset (all columns) which is
part of our regression data. It is roughly 400 MB with ~100 columns of
int/bigint/string type.

On Hive, it took roughly 64 sec.
With this Impala implementation, it took 35 sec. Adding inline to hash-util.h (which this patch
 does not do) would bring that down to 24-26 sec.

Change-Id: Ibcba3c9512b49e8b9eb0c2fec59dfd27f14f84c3
2014-01-08 10:44:27 -08:00

669 lines
14 KiB
Plaintext

# Test the distinctpcsa aggregate function on all col types with group by.
# Every distinctpcsa() column comes back as a string holding an estimate rather than an
# exact distinct count (e.g. bool_col is expected to report '82' in every group).
select
tinyint_col,
distinctpcsa(id),
distinctpcsa(bool_col),
distinctpcsa(smallint_col),
distinctpcsa(int_col),
distinctpcsa(bigint_col),
distinctpcsa(float_col),
distinctpcsa(double_col),
distinctpcsa(string_col),
distinctpcsa(timestamp_col)
from alltypesagg
group by tinyint_col
order by tinyint_col
limit 100
---- TYPES
tinyint, string, string, string, string, string, string, string, string, string
---- RESULTS
1,'1101','82','87','142','136','137','124','117','1149'
2,'1162','82','88','137','137','153','134','117','926'
3,'1125','82','86','146','153','145','133','117','967'
4,'1031','82','87','143','140','134','143','117','1031'
5,'1175','82','87','146','140','156','145','117','1009'
6,'1077','82','86','146','146','151','139','117','988'
7,'1213','82','88','146','148','139','148','117','1101'
8,'1137','82','87','145','139','156','133','117','1077'
9,'998','82','87','140','143','148','130','117','1101'
NULL,'1162','82','87','151','136','146','140','117','1031'
====
# Test the distinctpcsa aggregate function on all col types without group by.
# A single output row covers the whole table; tinyint_col is aggregated here instead of
# being the grouping key.
select
distinctpcsa(id),
distinctpcsa(bool_col),
distinctpcsa(tinyint_col),
distinctpcsa(smallint_col),
distinctpcsa(int_col),
distinctpcsa(bigint_col),
distinctpcsa(float_col),
distinctpcsa(double_col),
distinctpcsa(string_col),
distinctpcsa(timestamp_col)
from alltypesagg
---- TYPES
string, string, string, string, string, string, string, string, string, string
---- RESULTS
'3744','82','88','172','1832','1089','1077','1009','988','9817'
====
# Test the distinctpc aggregate function on all col types with group by.
# Every distinctpc() column is typed as string; the expected rows list the NULL
# tinyint_col group last.
select
tinyint_col,
distinctpc(id),
distinctpc(bool_col),
distinctpc(smallint_col),
distinctpc(int_col),
distinctpc(bigint_col),
distinctpc(float_col),
distinctpc(double_col),
distinctpc(string_col),
distinctpc(timestamp_col)
from alltypesagg
group by tinyint_col
order by tinyint_col
limit 100
---- TYPES
tinyint, string, string, string, string, string, string, string, string, string
---- RESULTS
1,'1089','1','11','120','70','99','96','82','1125'
2,'1043','1','11','105','94','88','92','82','886'
3,'1043','1','11','105','107','106','81','82','906'
4,'1009','1','9','109','108','103','97','165','977'
5,'1009','1','9','109','122','96','123','165','988'
6,'977','1','12','101','112','98','119','165','956'
7,'977','1','12','101','95','118','99','165','1227'
8,'1043','1','8','107','85','100','124','82','1149'
9,'1043','1','8','107','88','96','91','82','1125'
NULL,'1089','1','6','119','86','119','93','82','998'
====
# Test the distinctpc aggregate function on all col types without group by.
# A single output row covers the whole table; every column is typed as string.
select
distinctpc(id),
distinctpc(bool_col),
distinctpc(tinyint_col),
distinctpc(smallint_col),
distinctpc(int_col),
distinctpc(bigint_col),
distinctpc(float_col),
distinctpc(double_col),
distinctpc(string_col),
distinctpc(timestamp_col)
from alltypesagg
---- TYPES
string, string, string, string, string, string, string, string, string, string
---- RESULTS
'2647','3','3','122','1323','1295','1101','1031','1077','11930'
====
# Test the distinctpc aggregate function on empty table.
# The expected result is the string '0', not NULL.
select distinctpc(field) from EmptyTable
---- TYPES
string
---- RESULTS
'0'
====
# no grouping exprs, cols contain nulls
# count(tinyint_col) is expected to be lower than count(*) because it skips the NULL rows.
select count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col),
avg(tinyint_col)
from alltypesagg$TABLE
---- TYPES
bigint, bigint, tinyint, tinyint, bigint, double
---- RESULTS
10000,9000,1,9,45000,5
====
# no grouping exprs, smallint_col contains nulls (count(smallint_col) < count(*))
select count(*), count(smallint_col), min(smallint_col), max(smallint_col), sum(smallint_col),
avg(smallint_col)
from alltypesagg$TABLE
---- TYPES
bigint, bigint, smallint, smallint, bigint, double
---- RESULTS
10000,9900,1,99,495000,50
====
# no grouping exprs, int_col contains nulls (count(int_col) < count(*))
select count(*), count(int_col), min(int_col), max(int_col), sum(int_col), avg(int_col)
from alltypesagg$TABLE
---- TYPES
bigint, bigint, int, int, bigint, double
---- RESULTS
10000,9990,1,999,4995000,500
====
# no grouping exprs, bigint_col contains nulls (count(bigint_col) < count(*))
select count(*), count(bigint_col), min(bigint_col), max(bigint_col), sum(bigint_col),
avg(bigint_col)
from alltypesagg$TABLE
---- TYPES
bigint, bigint, bigint, bigint, bigint, double
---- RESULTS
10000,9990,10,9990,49950000,5000
====
# no grouping exprs, float_col contains nulls; min/max stay float while sum and avg are
# typed as double
select count(*), count(float_col), min(float_col), max(float_col), sum(float_col),
avg(float_col)
from alltypesagg$TABLE
---- TYPES
bigint, bigint, float, float, double, double
---- RESULTS
10000,9990,1.100000023841858,1098.900024414062,5494499.999767542,549.9999999767309
====
# no grouping exprs, double_col contains nulls (count(double_col) < count(*))
select count(*), count(double_col), min(double_col), max(double_col), sum(double_col),
avg(double_col)
from alltypesagg$TABLE
---- TYPES
bigint, bigint, double, double, double, double
---- RESULTS
10000,9990,10.1,10089.9,50449500,5050
====
# no grouping exprs, min/max over the two string columns
select count(*), min(string_col), max(string_col), min(date_string_col),
max(date_string_col)
from alltypesagg$TABLE
---- TYPES
bigint, string, string, string, string
---- RESULTS
10000,'0','999','01/01/10','01/10/10'
====
# no grouping exprs, min/max/avg over timestamp_col; avg is typed as timestamp as well
select min(timestamp_col), max(timestamp_col), avg(timestamp_col) from alltypesagg$TABLE
---- TYPES
timestamp, timestamp, timestamp
---- RESULTS
2010-01-01 00:00:00,2010-01-10 18:02:05.100000000,2010-01-05 20:47:11.705086469
====
# grouping by different data types, with NULLs
# ordinal 1 refers to tinyint_col; NULL is expected to form its own group
select tinyint_col, count(*) from alltypesagg$TABLE group by 1
---- TYPES
tinyint, bigint
---- RESULTS
1,1000
2,1000
3,1000
4,1000
5,1000
6,1000
7,1000
8,1000
9,1000
NULL,1000
====
# grouping by different data types, with NULLs, grouping expr missing from select list
# expect one count per tinyint_col group, including the NULL group (10 rows in total)
select count(*) from alltypesagg$TABLE group by tinyint_col
---- TYPES
bigint
---- RESULTS
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
====
# group by an expression (smallint_col % 10) referenced by ordinal, with NULLs
select smallint_col % 10, count(*) from alltypesagg$TABLE group by 1
---- TYPES
smallint, bigint
---- RESULTS
0,900
1,1000
2,1000
3,1000
4,1000
5,1000
6,1000
7,1000
8,1000
9,1000
NULL,100
====
# group by smallint_col % 10 spelled out, grouping expr missing from select list
select count(*) from alltypesagg$TABLE group by smallint_col % 10
---- TYPES
bigint
---- RESULTS
100
1000
1000
1000
1000
1000
1000
1000
1000
1000
900
====
# group by an expression (int_col % 10) referenced by ordinal, with NULLs
select int_col % 10, count(*) from alltypesagg$TABLE group by 1
---- TYPES
int, bigint
---- RESULTS
0,990
1,1000
2,1000
3,1000
4,1000
5,1000
6,1000
7,1000
8,1000
9,1000
NULL,10
====
# group by int_col % 10 spelled out, grouping expr missing from select list
select count(*) from alltypesagg$TABLE group by int_col % 10
---- TYPES
bigint
---- RESULTS
10
1000
1000
1000
1000
1000
1000
1000
1000
1000
990
====
# group by an expression (bigint_col % 100) referenced by ordinal, with NULLs
select bigint_col % 100, count(*) from alltypesagg$TABLE group by 1
---- TYPES
bigint, bigint
---- RESULTS
0,990
10,1000
20,1000
30,1000
40,1000
50,1000
60,1000
70,1000
80,1000
90,1000
NULL,10
====
# group by bigint_col % 100 spelled out, grouping expr missing from select list
select count(*) from alltypesagg$TABLE group by bigint_col % 100
---- TYPES
bigint
---- RESULTS
10
1000
1000
1000
1000
1000
1000
1000
1000
1000
990
====
# group by a string column referenced by ordinal; no NULL group is expected here
select date_string_col, count(*) from alltypesagg$TABLE group by 1
---- TYPES
string, bigint
---- RESULTS
'01/01/10',1000
'01/02/10',1000
'01/03/10',1000
'01/04/10',1000
'01/05/10',1000
'01/06/10',1000
'01/07/10',1000
'01/08/10',1000
'01/09/10',1000
'01/10/10',1000
====
# group by a string column, grouping expr missing from select list
select count(*) from alltypesagg$TABLE group by date_string_col
---- TYPES
bigint
---- RESULTS
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
====
# grouping by multiple exprs, with nulls
# only rows with day = 1 are aggregated; ordinals 1 and 2 refer to the two modulo exprs
select tinyint_col % 3, smallint_col % 3, count(*) from alltypesagg$TABLE
where day = 1 group by 1, 2
---- TYPES
tinyint, smallint, bigint
---- RESULTS
0,0,120
0,1,90
0,2,90
1,0,90
1,1,120
1,2,90
2,0,90
2,1,90
2,2,120
NULL,0,30
NULL,1,30
NULL,2,30
NULL,NULL,10
====
# grouping by multiple exprs spelled out, grouping exprs missing from select list
select count(*) from alltypesagg$TABLE
where day = 1 group by tinyint_col % 3, smallint_col % 3
---- TYPES
bigint
---- RESULTS
10
120
120
120
30
30
30
90
90
90
90
90
90
====
# ordinals listed in reverse order (group by 2, 1); the expected rows are identical to the
# 'group by 1, 2' test above
select tinyint_col % 3, smallint_col % 3, count(*) from alltypesagg$TABLE where day = 1 group by 2, 1
---- TYPES
tinyint, smallint, bigint
---- RESULTS
0,0,120
0,1,90
0,2,90
1,0,90
1,1,120
1,2,90
2,0,90
2,1,90
2,2,120
NULL,0,30
NULL,1,30
NULL,2,30
NULL,NULL,10
====
# grouping by five exprs of mixed types, all referenced by ordinal; input is limited to
# two date_string_col values
select tinyint_col % 2, smallint_col % 2, int_col % 2, bigint_col % 2, date_string_col, count(*)
from alltypesagg$TABLE
where date_string_col = '01/01/10' or date_string_col = '01/02/10'
group by 1, 2, 3, 4, 5
---- TYPES
tinyint, smallint, int, bigint, string, bigint
---- RESULTS
0,0,0,0,'01/01/10',400
0,0,0,0,'01/02/10',400
1,1,1,0,'01/01/10',500
1,1,1,0,'01/02/10',500
NULL,0,0,0,'01/01/10',90
NULL,0,0,0,'01/02/10',90
NULL,NULL,0,0,'01/01/10',9
NULL,NULL,0,0,'01/02/10',9
NULL,NULL,NULL,NULL,'01/01/10',1
NULL,NULL,NULL,NULL,'01/02/10',1
====
# grouping by five exprs of mixed types spelled out, grouping exprs missing from select list
select count(*)
from alltypesagg$TABLE
where date_string_col = '01/01/10' or date_string_col = '01/02/10'
group by tinyint_col % 2, smallint_col % 2, int_col % 2, bigint_col % 2, date_string_col
---- TYPES
bigint
---- RESULTS
1
1
400
400
500
500
9
9
90
90
====
# no grouping cols, no matching rows
# count(*) is expected to yield 0 while min, max, sum and avg yield NULL
select count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
from alltypesagg$TABLE
where tinyint_col = -1
---- TYPES
bigint, tinyint, tinyint, bigint, double
---- RESULTS
0,NULL,NULL,NULL,NULL
====
# no grouping cols, no matching rows (smallint_col)
select count(*), min(smallint_col), max(smallint_col), sum(smallint_col), avg(smallint_col)
from alltypesagg$TABLE
where smallint_col = -1
---- TYPES
bigint, smallint, smallint, bigint, double
---- RESULTS
0,NULL,NULL,NULL,NULL
====
# no grouping cols, no matching rows (int_col)
select count(*), min(int_col), max(int_col), sum(int_col), avg(int_col)
from alltypesagg$TABLE
where int_col = -1
---- TYPES
bigint, int, int, bigint, double
---- RESULTS
0,NULL,NULL,NULL,NULL
====
# no grouping cols, no matching rows (bigint_col)
select count(*), min(bigint_col), max(bigint_col), sum(bigint_col), avg(bigint_col)
from alltypesagg$TABLE
where bigint_col = -1
---- TYPES
bigint, bigint, bigint, bigint, double
---- RESULTS
0,NULL,NULL,NULL,NULL
====
# no grouping cols, no matching rows (float_col, using a range predicate)
select count(*), min(float_col), max(float_col), sum(float_col), avg(float_col)
from alltypesagg$TABLE
where float_col < -1.0
---- TYPES
bigint, float, float, double, double
---- RESULTS
0,NULL,NULL,NULL,NULL
====
# no grouping cols, no matching rows (double_col, using a range predicate)
select count(*), min(double_col), max(double_col), sum(double_col), avg(double_col)
from alltypesagg$TABLE
where double_col < -1.0
---- TYPES
bigint, double, double, double, double
---- RESULTS
0,NULL,NULL,NULL,NULL
====
# HAVING clauses over all aggregation functions, plus compound HAVING clauses
# baseline without HAVING: per-group count(*) and max(int_col) for the max() tests below
select int_col % 7, count(*), max(int_col) from alltypesagg$TABLE group by 1
---- TYPES
int, bigint, int
---- RESULTS
0,1420,994
1,1430,995
2,1430,996
3,1430,997
4,1430,998
5,1430,999
6,1420,993
NULL,10,NULL
====
# HAVING on max(int_col), which is not in the select list; the NULL group drops out
select int_col % 7, count(*) from alltypesagg$TABLE group by 1 having max(int_col) > 991
---- TYPES
int, bigint
---- RESULTS
0,1420
1,1430
2,1430
3,1430
4,1430
5,1430
6,1420
====
# compound HAVING (AND): additionally requires count(*) > 1420, leaving groups 1-5
select int_col % 7, count(*) from alltypesagg$TABLE group by 1
having max(int_col) > 991 and count(*) > 1420
---- TYPES
int, bigint
---- RESULTS
1,1430
2,1430
3,1430
4,1430
5,1430
====
# HAVING on min(int_col), which is not in the select list; group 0 and the NULL group
# drop out
select int_col % 7, count(*) from alltypesagg$TABLE group by 1
having min(int_col) < 7
---- TYPES
int, bigint
---- RESULTS
1,1430
2,1430
3,1430
4,1430
5,1430
6,1420
====
# compound HAVING (AND) on min(int_col) and count(*), leaving groups 1-5
select int_col % 7, count(*) from alltypesagg$TABLE group by 1
having min(int_col) < 7 and count(*) > 1420
---- TYPES
int, bigint
---- RESULTS
1,1430
2,1430
3,1430
4,1430
5,1430
====
# baseline without HAVING: per-group count(*) and sum(int_col) for the sum() tests below
select int_col % 7, count(*), sum(int_col) from alltypesagg$TABLE group by 1
---- TYPES
int, bigint, bigint
---- RESULTS
0,1420,710710
1,1430,712140
2,1430,713570
3,1430,715000
4,1430,716430
5,1430,717860
6,1420,709290
NULL,10,NULL
====
# HAVING on sum(); >= is inclusive, so group 3 (sum exactly 715000) is kept
select int_col % 7, count(*), sum(int_col) from alltypesagg$TABLE group by 1
having sum(int_col) >= 715000
---- TYPES
int, bigint, bigint
---- RESULTS
3,1430,715000
4,1430,716430
5,1430,717860
====
# compound HAVING (OR): a group passes on either the sum or the count condition
select int_col % 7, count(*), sum(int_col) from alltypesagg$TABLE group by 1
having sum(int_col) >= 715000 or count(*) > 1420
---- TYPES
int, bigint, bigint
---- RESULTS
1,1430,712140
2,1430,713570
3,1430,715000
4,1430,716430
5,1430,717860
====
# HAVING with IS NULL on an aggregate: keeps only the NULL group, whose sum is NULL
select int_col % 7, count(*), sum(int_col) from alltypesagg$TABLE group by 1
having sum(int_col) is null
---- TYPES
int, bigint, bigint
---- RESULTS
NULL,10,NULL
====
# baseline without HAVING: per-group count(*) and avg(int_col) for the avg() tests below
select int_col % 7, count(*), avg(int_col) from alltypesagg$TABLE group by 1
---- TYPES
int, bigint, double
---- RESULTS
0,1420,500.5
1,1430,498
2,1430,499
3,1430,500
4,1430,501
5,1430,502
6,1420,499.5
NULL,10,NULL
====
# HAVING on avg(); > is strict, so group 3 (avg exactly 500) is dropped
select int_col % 7, count(*), avg(int_col) from alltypesagg$TABLE group by 1
having avg(int_col) > 500
---- TYPES
int, bigint, double
---- RESULTS
0,1420,500.5
4,1430,501
5,1430,502
====
# compound HAVING (OR): count(*) = 10 brings the NULL group back in
select int_col % 7, count(*), avg(int_col) from alltypesagg$TABLE group by 1
having avg(int_col) > 500 or count(*) = 10
---- TYPES
int, bigint, double
---- RESULTS
0,1420,500.5
4,1430,501
5,1430,502
NULL,10,NULL
====
# HAVING on the grouping column itself (no aggregate), compared against a string literal
# cast to timestamp
select timestamp_col, count(*) from alltypesagg$TABLE
group by timestamp_col having timestamp_col < cast('2010-01-01 01:05:20' as timestamp)
---- TYPES
timestamp, bigint
---- RESULTS
2010-01-01 00:00:00,1
2010-01-01 00:01:00,1
2010-01-01 00:02:00.100000000,1
2010-01-01 00:03:00.300000000,1
2010-01-01 00:04:00.600000000,1
2010-01-01 00:05:00.100000000,1
2010-01-01 00:06:00.150000000,1
2010-01-01 00:07:00.210000000,1
2010-01-01 00:08:00.280000000,1
2010-01-01 00:09:00.360000000,1
2010-01-01 00:10:00.450000000,1
2010-01-01 00:11:00.550000000,1
2010-01-01 00:12:00.660000000,1
2010-01-01 00:13:00.780000000,1
2010-01-01 00:14:00.910000000,1
2010-01-01 00:15:01.500000000,1
2010-01-01 00:16:01.200000000,1
2010-01-01 00:17:01.360000000,1
2010-01-01 00:18:01.530000000,1
2010-01-01 00:19:01.710000000,1
2010-01-01 00:20:01.900000000,1
2010-01-01 00:21:02.100000000,1
2010-01-01 00:22:02.310000000,1
2010-01-01 00:23:02.530000000,1
2010-01-01 00:24:02.760000000,1
2010-01-01 00:25:03,1
2010-01-01 00:26:03.250000000,1
2010-01-01 00:27:03.510000000,1
2010-01-01 00:28:03.780000000,1
2010-01-01 00:29:04.600000000,1
2010-01-01 00:30:04.350000000,1
2010-01-01 00:31:04.650000000,1
2010-01-01 00:32:04.960000000,1
2010-01-01 00:33:05.280000000,1
2010-01-01 00:34:05.610000000,1
2010-01-01 00:35:05.950000000,1
2010-01-01 00:36:06.300000000,1
2010-01-01 00:37:06.660000000,1
2010-01-01 00:38:07.300000000,1
2010-01-01 00:39:07.410000000,1
2010-01-01 00:40:07.800000000,1
2010-01-01 00:41:08.200000000,1
2010-01-01 00:42:08.610000000,1
2010-01-01 00:43:09.300000000,1
2010-01-01 00:44:09.460000000,1
2010-01-01 00:45:09.900000000,1
2010-01-01 00:46:10.350000000,1
2010-01-01 00:47:10.810000000,1
2010-01-01 00:48:11.280000000,1
2010-01-01 00:49:11.760000000,1
2010-01-01 00:50:12.250000000,1
2010-01-01 00:51:12.750000000,1
2010-01-01 00:52:13.260000000,1
2010-01-01 00:53:13.780000000,1
2010-01-01 00:54:14.310000000,1
2010-01-01 00:55:14.850000000,1
2010-01-01 00:56:15.400000000,1
2010-01-01 00:57:15.960000000,1
2010-01-01 00:58:16.530000000,1
2010-01-01 00:59:17.110000000,1
2010-01-01 01:00:17.700000000,1
2010-01-01 01:01:18.300000000,1
2010-01-01 01:02:18.910000000,1
2010-01-01 01:03:19.530000000,1
2010-01-01 01:04:20.160000000,1
====