mirror of
https://github.com/apache/impala.git
synced 2025-12-31 06:02:51 -05:00
This is implemented in the backend (BE) using HyperLogLog (HLL), but we could change this in the future. These estimates usually work better than those of the other algorithm we have; note that we have not yet implemented all the improvements from the Google paper. Change-Id: Ied715ddd0e1a7cbe7f5f90469f1ed3d4b9c537c7 Reviewed-on: http://gerrit.ent.cloudera.com:8080/956 Reviewed-by: Nong Li <nong@cloudera.com> Tested-by: jenkins
142 lines
4.1 KiB
Plaintext
142 lines
4.1 KiB
Plaintext
# These computations take a while so we do not want to add too many test cases
|
|
# They also require a non-trivial amount of data to get reasonable results.
|
|
====
|
|
---- QUERY
|
|
# Compare the exact count(distinct id) with the distinctpc and distinctpcsa
# estimates of the same column on alltypesagg.
select count(distinct id), distinctpc(id), distinctpcsa(id) from alltypesagg
|
|
---- TYPES
|
|
bigint, string, string
|
|
---- RESULTS
|
|
10000,'10590','17429'
|
|
====
|
|
---- QUERY
|
|
# Test the distinctpcsa aggregate function on all col types with group by.
# tinyint_col is the grouping key, so it is selected as-is instead of being
# aggregated; the expected RESULTS include a row for the NULL group.
|
|
select
|
|
tinyint_col,
|
|
distinctpcsa(id),
|
|
distinctpcsa(bool_col),
|
|
distinctpcsa(smallint_col),
|
|
distinctpcsa(int_col),
|
|
distinctpcsa(bigint_col),
|
|
distinctpcsa(float_col),
|
|
distinctpcsa(double_col),
|
|
distinctpcsa(string_col),
|
|
distinctpcsa(timestamp_col)
|
|
from alltypesagg
|
|
group by tinyint_col
|
|
---- TYPES
|
|
tinyint, string, string, string, string, string, string, string, string, string
|
|
---- RESULTS
|
|
3,'1200','82','86','140','131','146','139','163','151'
|
|
NULL,'1101','83','87','142','131','133','142','158','140'
|
|
6,'998','83','87','137','128','146','142','161','143'
|
|
9,'867','82','88','130','131','139','136','161','114'
|
|
8,'977','83','88','139','124','139','142','163','139'
|
|
2,'906','83','86','151','136','140','160','158','128'
|
|
5,'1149','82','87','137','130','148','140','163','150'
|
|
4,'1043','83','87','128','126','140','139','161','134'
|
|
1,'1066','82','87','148','128','146','139','163','143'
|
|
7,'1125','82','87','140','140','145','140','163','143'
|
|
====
|
|
---- QUERY
|
|
# Test the distinctpcsa aggregate function on all col types without group by.
# A single result row is expected, with one string-typed estimate per column.
|
|
select
|
|
distinctpcsa(id),
|
|
distinctpcsa(bool_col),
|
|
distinctpcsa(tinyint_col),
|
|
distinctpcsa(smallint_col),
|
|
distinctpcsa(int_col),
|
|
distinctpcsa(bigint_col),
|
|
distinctpcsa(float_col),
|
|
distinctpcsa(double_col),
|
|
distinctpcsa(string_col),
|
|
distinctpcsa(timestamp_col)
|
|
from alltypesagg
|
|
---- TYPES
|
|
string, string, string, string, string, string, string, string, string, string
|
|
---- RESULTS
|
|
'17429','83','87','109','544','1213','867','1043','252','1031'
|
|
====
|
|
---- QUERY
|
|
# Test the distinctpc aggregate function on all col types with group by.
# tinyint_col is the grouping key, so it is selected as-is instead of being
# aggregated; the expected RESULTS include a row for the NULL group.
|
|
select
|
|
tinyint_col,
|
|
distinctpc(id),
|
|
distinctpc(bool_col),
|
|
distinctpc(smallint_col),
|
|
distinctpc(int_col),
|
|
distinctpc(bigint_col),
|
|
distinctpc(float_col),
|
|
distinctpc(double_col),
|
|
distinctpc(string_col),
|
|
distinctpc(timestamp_col)
|
|
from alltypesagg
|
|
group by tinyint_col
|
|
---- TYPES
|
|
tinyint, string, string, string, string, string, string, string, string, string
|
|
---- RESULTS
|
|
1,'896','1','11','20','89','112','88','151','108'
|
|
2,'936','1','11','20','100','153','94','180','102'
|
|
3,'936','1','11','20','109','118','90','182','103'
|
|
4,'977','1','14','20','81','84','83','190','101'
|
|
5,'977','1','14','20','96','97','100','151','102'
|
|
6,'998','1','13','20','97','128','101','178','102'
|
|
7,'998','1','13','20','86','117','105','158','97'
|
|
8,'896','1','13','20','88','96','101','155','93'
|
|
9,'896','1','13','20','109','142','109','182','100'
|
|
NULL,'896','1','11','20','91','97','85','176','98'
|
|
====
|
|
---- QUERY
|
|
# Test the distinctpc aggregate function on all col types without group by.
# A single result row is expected, with one string-typed estimate per column.
|
|
select
|
|
distinctpc(id),
|
|
distinctpc(bool_col),
|
|
distinctpc(tinyint_col),
|
|
distinctpc(smallint_col),
|
|
distinctpc(int_col),
|
|
distinctpc(bigint_col),
|
|
distinctpc(float_col),
|
|
distinctpc(double_col),
|
|
distinctpc(string_col),
|
|
distinctpc(timestamp_col)
|
|
from alltypesagg
|
|
---- TYPES
|
|
string, string, string, string, string, string, string, string, string, string
|
|
---- RESULTS
|
|
'10590','3','7','63','20','1295','936','1020','330','988'
|
|
====
|
|
---- QUERY
|
|
# Test the ndv aggregate function on all col types without group by.
# NOTE(review): the original comment called this the "hll" function; presumably
# ndv is the HyperLogLog-based estimator - confirm against the backend.
# A single result row is expected, with one string-typed estimate per column.
|
|
select
|
|
ndv(id),
|
|
ndv(bool_col),
|
|
ndv(tinyint_col),
|
|
ndv(smallint_col),
|
|
ndv(int_col),
|
|
ndv(bigint_col),
|
|
ndv(float_col),
|
|
ndv(double_col),
|
|
ndv(string_col),
|
|
ndv(timestamp_col)
|
|
from alltypesagg
|
|
---- TYPES
|
|
string, string, string, string, string, string, string, string, string, string
|
|
---- RESULTS
|
|
'10549','2','9','104','1881','1342','944','991','1178','9880'
|
|
====
|
|
---- QUERY
|
|
# Test the distinctpc aggregate function on an empty table.
# With no input rows, the expected estimate is the string '0'.
|
|
select distinctpc(field) from EmptyTable
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'0'
|
|
====
|
|
---- QUERY
|
|
# Test the ndv aggregate function on an empty table.
# NOTE(review): the original comment called this the "hll" function; presumably
# ndv is the HyperLogLog-based estimator - confirm against the backend.
# With no input rows, the expected estimate is the string '0'.
|
|
select ndv(field) from EmptyTable
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'0'
|
|
====
|