Files
impala/testdata/workloads/functional-query/queries/QueryTest/distinct-estimate.test
Nong Li 7f08146b88 Add ndv (distinct estimate) as a builtin aggregate function.
This is implemented in the BE using HLL (but we could change this in the
future).

These estimates usually work better than those of the other algorithm we have,
even though we have not yet implemented all the improvements from the Google paper.

Change-Id: Ied715ddd0e1a7cbe7f5f90469f1ed3d4b9c537c7
Reviewed-on: http://gerrit.ent.cloudera.com:8080/956
Reviewed-by: Nong Li <nong@cloudera.com>
Tested-by: jenkins
2014-01-08 10:54:03 -08:00

142 lines
4.1 KiB
Plaintext

# These computations take a while, so we do not want to add too many test cases.
# They also require a non-trivial amount of data to get reasonable results.
====
---- QUERY
# Exact distinct count of id next to its distinctpc and distinctpcsa estimates
select
  count(distinct id),
  distinctpc(id),
  distinctpcsa(id)
from alltypesagg
---- TYPES
bigint, string, string
---- RESULTS
10000,'10590','17429'
====
---- QUERY
# distinctpcsa over each column type, one set of estimates per tinyint_col group
select
  tinyint_col,
  distinctpcsa(id),
  distinctpcsa(bool_col),
  distinctpcsa(smallint_col),
  distinctpcsa(int_col),
  distinctpcsa(bigint_col),
  distinctpcsa(float_col),
  distinctpcsa(double_col),
  distinctpcsa(string_col),
  distinctpcsa(timestamp_col)
from alltypesagg
group by
  tinyint_col
---- TYPES
tinyint, string, string, string, string, string, string, string, string, string
---- RESULTS
3,'1200','82','86','140','131','146','139','163','151'
NULL,'1101','83','87','142','131','133','142','158','140'
6,'998','83','87','137','128','146','142','161','143'
9,'867','82','88','130','131','139','136','161','114'
8,'977','83','88','139','124','139','142','163','139'
2,'906','83','86','151','136','140','160','158','128'
5,'1149','82','87','137','130','148','140','163','150'
4,'1043','83','87','128','126','140','139','161','134'
1,'1066','82','87','148','128','146','139','163','143'
7,'1125','82','87','140','140','145','140','163','143'
====
---- QUERY
# distinctpcsa over each column type, aggregated across the whole table
select
  distinctpcsa(id),
  distinctpcsa(bool_col),
  distinctpcsa(tinyint_col),
  distinctpcsa(smallint_col),
  distinctpcsa(int_col),
  distinctpcsa(bigint_col),
  distinctpcsa(float_col),
  distinctpcsa(double_col),
  distinctpcsa(string_col),
  distinctpcsa(timestamp_col)
from alltypesagg
---- TYPES
string, string, string, string, string, string, string, string, string, string
---- RESULTS
'17429','83','87','109','544','1213','867','1043','252','1031'
====
---- QUERY
# distinctpc over each column type, one set of estimates per tinyint_col group
select
  tinyint_col,
  distinctpc(id),
  distinctpc(bool_col),
  distinctpc(smallint_col),
  distinctpc(int_col),
  distinctpc(bigint_col),
  distinctpc(float_col),
  distinctpc(double_col),
  distinctpc(string_col),
  distinctpc(timestamp_col)
from alltypesagg
group by
  tinyint_col
---- TYPES
tinyint, string, string, string, string, string, string, string, string, string
---- RESULTS
1,'896','1','11','20','89','112','88','151','108'
2,'936','1','11','20','100','153','94','180','102'
3,'936','1','11','20','109','118','90','182','103'
4,'977','1','14','20','81','84','83','190','101'
5,'977','1','14','20','96','97','100','151','102'
6,'998','1','13','20','97','128','101','178','102'
7,'998','1','13','20','86','117','105','158','97'
8,'896','1','13','20','88','96','101','155','93'
9,'896','1','13','20','109','142','109','182','100'
NULL,'896','1','11','20','91','97','85','176','98'
====
---- QUERY
# distinctpc over each column type, aggregated across the whole table
select
  distinctpc(id),
  distinctpc(bool_col),
  distinctpc(tinyint_col),
  distinctpc(smallint_col),
  distinctpc(int_col),
  distinctpc(bigint_col),
  distinctpc(float_col),
  distinctpc(double_col),
  distinctpc(string_col),
  distinctpc(timestamp_col)
from alltypesagg
---- TYPES
string, string, string, string, string, string, string, string, string, string
---- RESULTS
'10590','3','7','63','20','1295','936','1020','330','988'
====
---- QUERY
# ndv (the hll-based distinct estimate) over each column type, aggregated across
# the whole table
select
  ndv(id),
  ndv(bool_col),
  ndv(tinyint_col),
  ndv(smallint_col),
  ndv(int_col),
  ndv(bigint_col),
  ndv(float_col),
  ndv(double_col),
  ndv(string_col),
  ndv(timestamp_col)
from alltypesagg
---- TYPES
string, string, string, string, string, string, string, string, string, string
---- RESULTS
'10549','2','9','104','1881','1342','944','991','1178','9880'
====
---- QUERY
# distinctpc over an empty table
select
  distinctpc(field)
from EmptyTable
---- TYPES
string
---- RESULTS
'0'
====
---- QUERY
# ndv (the hll-based distinct estimate) over an empty table
select
  ndv(field)
from EmptyTable
---- TYPES
string
---- RESULTS
'0'
====