mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
Based on Google's HyperLogLog++ paper. Uses a bias correcting interpolation as a sub algorithm for Hll estimates within a specific range. Change-Id: If4fe692b4308f6a57aea6167e9bc00db11eaaab9 Reviewed-on: http://gerrit.cloudera.org:8080/415 Tested-by: Internal Jenkins Reviewed-by: Henry Robinson <henry@cloudera.com>
142 lines
3.6 KiB
Plaintext
142 lines
3.6 KiB
Plaintext
# These computations take a while so we do not want to add too many test cases
|
|
# They also require a non-trivial amount of data to get reasonable results.
|
|
====
|
|
---- QUERY
|
|
# Put the exact distinct count of id next to the distinctpc and distinctpcsa
# results for the same column.
SELECT
  count(DISTINCT id),
  distinctpc(id),
  distinctpcsa(id)
FROM alltypesagg
|
|
---- TYPES
|
|
bigint, bigint, bigint
|
|
---- RESULTS
|
|
10000,10590,17429
|
|
====
|
|
---- QUERY
|
|
# distinctpcsa over each remaining column type of alltypesagg, with one output
# row per tinyint_col group (including the NULL group).
SELECT
  tinyint_col,
  distinctpcsa(id),
  distinctpcsa(bool_col),
  distinctpcsa(smallint_col),
  distinctpcsa(int_col),
  distinctpcsa(bigint_col),
  distinctpcsa(float_col),
  distinctpcsa(double_col),
  distinctpcsa(string_col),
  distinctpcsa(timestamp_col)
FROM alltypesagg
GROUP BY tinyint_col
|
|
---- TYPES
|
|
tinyint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint
|
|
---- RESULTS
|
|
2,904,2,9,123,101,107,134,131,1053
|
|
7,1124,0,12,107,107,114,107,138,829
|
|
8,976,2,14,105,84,105,110,138,802
|
|
NULL,1100,2,12,110,95,97,110,131,811
|
|
5,1149,0,12,103,93,118,107,138,914
|
|
3,1200,0,9,107,95,116,105,138,895
|
|
6,997,2,12,103,90,116,110,136,1187
|
|
9,866,0,14,93,95,105,101,136,838
|
|
4,1042,2,12,90,86,107,105,136,924
|
|
1,1065,0,12,118,90,116,105,138,955
|
|
====
|
|
---- QUERY
|
|
# distinctpcsa over each column type of alltypesagg, aggregated across the
# whole table (no grouping).
SELECT
  distinctpcsa(id),
  distinctpcsa(bool_col),
  distinctpcsa(tinyint_col),
  distinctpcsa(smallint_col),
  distinctpcsa(int_col),
  distinctpcsa(bigint_col),
  distinctpcsa(float_col),
  distinctpcsa(double_col),
  distinctpcsa(string_col),
  distinctpcsa(timestamp_col)
FROM alltypesagg
|
|
---- TYPES
|
|
bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint
|
|
---- RESULTS
|
|
17429,2,12,59,541,1213,866,1042,240,10476
|
|
====
|
|
---- QUERY
|
|
# distinctpc over each remaining column type of alltypesagg, with one output
# row per tinyint_col group (including the NULL group).
SELECT
  tinyint_col,
  distinctpc(id),
  distinctpc(bool_col),
  distinctpc(smallint_col),
  distinctpc(int_col),
  distinctpc(bigint_col),
  distinctpc(float_col),
  distinctpc(double_col),
  distinctpc(string_col),
  distinctpc(timestamp_col)
FROM alltypesagg
GROUP BY tinyint_col
|
|
---- TYPES
|
|
tinyint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint
|
|
---- RESULTS
|
|
1,896,1,11,20,89,112,88,151,977
|
|
2,936,1,11,20,100,153,94,180,1043
|
|
3,936,1,11,20,109,118,90,182,896
|
|
4,977,1,14,20,81,84,83,190,946
|
|
5,977,1,14,20,96,97,100,151,840
|
|
6,998,1,13,20,97,128,101,178,1077
|
|
7,998,1,13,20,86,117,105,158,858
|
|
8,896,1,13,20,88,96,101,155,745
|
|
9,896,1,13,20,109,142,109,182,926
|
|
NULL,896,1,11,20,91,97,85,176,753
|
|
====
|
|
---- QUERY
|
|
# distinctpc over each column type of alltypesagg, aggregated across the
# whole table (no grouping).
SELECT
  distinctpc(id),
  distinctpc(bool_col),
  distinctpc(tinyint_col),
  distinctpc(smallint_col),
  distinctpc(int_col),
  distinctpc(bigint_col),
  distinctpc(float_col),
  distinctpc(double_col),
  distinctpc(string_col),
  distinctpc(timestamp_col)
FROM alltypesagg
|
|
---- TYPES
|
|
bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint
|
|
---- RESULTS
|
|
10590,3,7,63,20,1295,936,1020,330,9711
|
|
====
|
|
---- QUERY
|
|
# ndv (the hll aggregate function) over each column type of alltypesagg,
# aggregated across the whole table (no grouping).
SELECT
  ndv(id),
  ndv(bool_col),
  ndv(tinyint_col),
  ndv(smallint_col),
  ndv(int_col),
  ndv(bigint_col),
  ndv(float_col),
  ndv(double_col),
  ndv(string_col),
  ndv(timestamp_col)
FROM alltypesagg
|
|
---- TYPES
|
|
bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint, bigint
|
|
---- RESULTS
|
|
10280,2,9,97,957,1008,1016,1005,963,10210
|
|
====
|
|
---- QUERY
|
|
# distinctpc evaluated against EmptyTable; the expected result is a single 0.
SELECT distinctpc(field)
FROM EmptyTable
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
0
|
|
====
|
|
---- QUERY
|
|
# ndv (the hll aggregate function) evaluated against EmptyTable; the expected
# result is a single 0.
SELECT ndv(field)
FROM EmptyTable
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
0
|
|
====
|