Files
impala/testdata/workloads/functional-query/queries/QueryTest/datasketches-theta.test
AlexanderSaydakov c925807b1a IMPALA-10901 cleaner and faster operations with datasketches
- serialize using bytes instead of stream
- avoid unnecessary constructor during deserialization
- simplified code slightly
- added original exception message to re-thrown generic message

Change-Id: I306a2489dac0f4d2d475e8f9987cd58bf95474bb
Reviewed-on: http://gerrit.cloudera.org:8080/17818
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2021-09-15 13:58:23 +00:00

562 lines
19 KiB
Plaintext

====
---- QUERY
# Sketch each supported column type with ds_theta_sketch() and feed the sketch to
# ds_theta_estimate(). A small table is enough here: the goal is to exercise the Impala
# functions, not to measure how well Datasketches Theta approximates count distinct
# values. Every column is expected to estimate to 10.
select
    ds_theta_estimate(ds_theta_sketch(tinyint_col)),
    ds_theta_estimate(ds_theta_sketch(int_col)),
    ds_theta_estimate(ds_theta_sketch(bigint_col)),
    ds_theta_estimate(ds_theta_sketch(float_col)),
    ds_theta_estimate(ds_theta_sketch(double_col)),
    ds_theta_estimate(ds_theta_sketch(string_col))
from
    functional_parquet.alltypessmall;
---- RESULTS
10,10,10,10,10,10
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# ds_theta_sketch_and_estimate() sketches and estimates in a single call. Each of the
# supported column types is expected to estimate to 10.
select
    ds_theta_sketch_and_estimate(tinyint_col),
    ds_theta_sketch_and_estimate(int_col),
    ds_theta_sketch_and_estimate(bigint_col),
    ds_theta_sketch_and_estimate(float_col),
    ds_theta_sketch_and_estimate(double_col),
    ds_theta_sketch_and_estimate(string_col)
from
    functional_parquet.alltypessmall;
---- RESULTS
10,10,10,10,10,10
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# ds_theta_sketch() must fail with an AnalysisException for unsupported argument types.
# BOOLEAN argument.
select ds_theta_sketch(bool_col)
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch(BOOLEAN)
====
---- QUERY
# SMALLINT argument.
select ds_theta_sketch(smallint_col)
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch(SMALLINT)
====
---- QUERY
# DATE argument, produced by casting the date string column.
select
    ds_theta_sketch(cast(date_string_col as date format 'MM/DD/YYYY'))
from
    functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch(DATE)
====
---- QUERY
# DECIMAL argument.
select ds_theta_sketch(d1)
from functional_parquet.decimal_tbl;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch(DECIMAL(9,0))
====
---- QUERY
# ds_theta_sketch_and_estimate() must fail with an AnalysisException for unsupported
# argument types.
# BOOLEAN argument.
select ds_theta_sketch_and_estimate(bool_col)
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch_and_estimate(BOOLEAN)
====
---- QUERY
# SMALLINT argument.
select ds_theta_sketch_and_estimate(smallint_col)
from functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch_and_estimate(SMALLINT)
====
---- QUERY
# DATE argument, produced by casting the date string column.
select
    ds_theta_sketch_and_estimate(cast(date_string_col as date format 'MM/DD/YYYY'))
from
    functional_parquet.alltypessmall;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch_and_estimate(DATE)
====
---- QUERY
# DECIMAL argument.
select ds_theta_sketch_and_estimate(d1)
from functional_parquet.decimal_tbl;
---- CATCH
AnalysisException: No matching function with signature: ds_theta_sketch_and_estimate(DECIMAL(9,0))
====
---- QUERY
# Check that Theta handles NULL values, both through ds_theta_sketch() followed by
# ds_theta_estimate() and through ds_theta_sketch_and_estimate(). The null_* columns
# are expected to estimate to 0 and some_nulls to 6 in both forms.
select
    ds_theta_estimate(ds_theta_sketch(null_str)),
    ds_theta_estimate(ds_theta_sketch(null_int)),
    ds_theta_estimate(ds_theta_sketch(null_double)),
    ds_theta_estimate(ds_theta_sketch(some_nulls)),
    ds_theta_sketch_and_estimate(null_str),
    ds_theta_sketch_and_estimate(null_int),
    ds_theta_sketch_and_estimate(null_double),
    ds_theta_sketch_and_estimate(some_nulls)
from
    functional_parquet.nullrows;
---- RESULTS
0,0,0,6,0,0,0,6
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Check that Theta works for empty datasets: both the two-step form and the combined
# ds_theta_sketch_and_estimate() form are expected to estimate to 0.
select
    ds_theta_estimate(ds_theta_sketch(field)),
    ds_theta_estimate(ds_theta_sketch(f2)),
    ds_theta_sketch_and_estimate(field),
    ds_theta_sketch_and_estimate(f2)
from
    functional_parquet.emptytable;
---- RESULTS
0,0,0,0
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT
====
---- QUERY
# Persist serialized sketches in a table as strings, one row per (year, month) group,
# and then estimate from the stored sketches.
# Note: the plan is to write sketches as binary instead of strings, which has to wait
# for binary support (IMPALA-9482).
create table sketch_store (
    year int,
    month int,
    date_sketch string,
    float_sketch string
)
stored as parquet;
insert into sketch_store
    select
        year,
        month,
        ds_theta_sketch(date_string_col),
        ds_theta_sketch(float_col)
    from functional_parquet.alltypessmall
    group by
        year,
        month;
select
    year,
    month,
    ds_theta_estimate(date_sketch),
    ds_theta_estimate(float_sketch)
from sketch_store
order by month;
---- RESULTS
2009,1,3,10
2009,2,3,10
2009,3,3,10
2009,4,3,10
---- TYPES
INT,INT,BIGINT,BIGINT
====
---- QUERY
# ds_theta_estimate() must return an error for strings that are not serialized sketches.
select ds_theta_estimate(date_string_col)
from functional_parquet.alltypestiny;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# ds_theta_estimate() must return an error for serialized HLL sketches: store sketches
# made by ds_hll_sketch() in a table and try to estimate them as Theta sketches.
create table hll_sketch_store (
    date_sketch string
)
stored as parquet;
insert into hll_sketch_store
    select ds_hll_sketch(date_string_col)
    from functional_parquet.alltypessmall;
select ds_theta_estimate(date_sketch)
from hll_sketch_store;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 7
====
---- QUERY
# Check that ds_theta_estimate() returns 0 (not NULL and not an error) for null and
# empty string inputs.
select
    ds_theta_estimate(b),
    ds_theta_estimate(c)
from
    functional_parquet.nulltable;
---- RESULTS
0,0
---- TYPES
BIGINT,BIGINT
====
---- QUERY
# Sketches produced by Hive must be readable by Impala and usable for estimation.
select
    ds_theta_estimate(ti) as ti,
    ds_theta_estimate(i) as i,
    ds_theta_estimate(bi) as bi,
    ds_theta_estimate(f) as f,
    ds_theta_estimate(d) as d,
    ds_theta_estimate(s) as s,
    ds_theta_estimate(c) as c,
    ds_theta_estimate(v) as v,
    ds_theta_estimate(nc) as nc
from
    theta_sketches_from_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,0
====
---- QUERY
# Union the per-group sketches stored in sketch_store and check that the union gives
# the same estimate as sketching the whole data set into a single sketch would.
select
    ds_theta_estimate(ds_theta_union(date_sketch)),
    ds_theta_estimate(ds_theta_union(float_sketch))
from
    sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
12,10
====
---- QUERY
# ds_theta_union() is expected to produce NULL for an empty dataset.
select ds_theta_union(field)
from functional_parquet.emptytable;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# ds_theta_union() is expected to produce NULL for NULL inputs.
select ds_theta_union(null_str)
from functional_parquet.nullrows;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# ds_theta_union() returns an error when its input is not a valid serialized sketch.
select ds_theta_union(date_string_col)
from functional_parquet.alltypestiny
where id = 1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# Put the sketches made by Hive and the same sketches made by Impala into one table.
# Estimating the union of the two rows is expected to give the same results as
# estimating either set of sketches on its own.
create table theta_sketches_impala_hive
    like theta_sketches_from_impala
    stored as parquet;
insert into theta_sketches_impala_hive
    select * from theta_sketches_from_hive;
insert into theta_sketches_impala_hive
    select * from theta_sketches_from_impala;
select
    ds_theta_estimate(ds_theta_union(ti)) as ti,
    ds_theta_estimate(ds_theta_union(i)) as i,
    ds_theta_estimate(ds_theta_union(bi)) as bi,
    ds_theta_estimate(ds_theta_union(f)) as f,
    ds_theta_estimate(ds_theta_union(d)) as d,
    ds_theta_estimate(ds_theta_union(s)) as s,
    ds_theta_estimate(ds_theta_union(c)) as c,
    ds_theta_estimate(ds_theta_union(v)) as v,
    ds_theta_estimate(ds_theta_union(nc)) as nc
from
    theta_sketches_impala_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,0
====
---- QUERY
# Intersect the per-group sketches stored in sketch_store and estimate the result.
# The date sketches are expected to intersect to 0 and the float sketches to 10.
select
    ds_theta_estimate(ds_theta_intersect(date_sketch)),
    ds_theta_estimate(ds_theta_intersect(float_sketch))
from
    sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
0,10
====
---- QUERY
# ds_theta_intersect() is expected to produce NULL for an empty dataset.
select ds_theta_intersect(field)
from functional_parquet.emptytable;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# ds_theta_intersect() is expected to produce NULL for NULL inputs.
select ds_theta_intersect(null_str)
from functional_parquet.nullrows;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# ds_theta_intersect() returns an error when its input is not a valid serialized sketch.
select ds_theta_intersect(date_string_col)
from functional_parquet.alltypestiny
where id = 1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# Check that intersecting the sketches made by Hive and then estimating the result is
# consistent with estimating those sketches directly.
select
    ds_theta_estimate(ds_theta_intersect(ti)) as ti,
    ds_theta_estimate(ds_theta_intersect(i)) as i,
    ds_theta_estimate(ds_theta_intersect(bi)) as bi,
    ds_theta_estimate(ds_theta_intersect(f)) as f,
    ds_theta_estimate(ds_theta_intersect(d)) as d,
    ds_theta_estimate(ds_theta_intersect(s)) as s,
    ds_theta_estimate(ds_theta_intersect(c)) as c,
    ds_theta_estimate(ds_theta_intersect(v)) as v,
    ds_theta_estimate(ds_theta_intersect(nc)) as nc
from
    theta_sketches_from_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,0
====
---- QUERY
# Intersect the sketches in theta_sketches_impala_hive and check that the intersection
# gives the same results as estimating the sketches separately.
select
    ds_theta_estimate(ds_theta_intersect(ti)) as ti,
    ds_theta_estimate(ds_theta_intersect(i)) as i,
    ds_theta_estimate(ds_theta_intersect(bi)) as bi,
    ds_theta_estimate(ds_theta_intersect(f)) as f,
    ds_theta_estimate(ds_theta_intersect(d)) as d,
    ds_theta_estimate(ds_theta_intersect(s)) as s,
    ds_theta_estimate(ds_theta_intersect(c)) as c,
    ds_theta_estimate(ds_theta_intersect(v)) as v,
    ds_theta_estimate(ds_theta_intersect(nc)) as nc
from
    theta_sketches_impala_hive;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,0
====
---- QUERY
# A and B refer to the two input sketches in the order A-not-B.
# Check that ds_theta_exclude() returns a NULL sketch when both A and B are NULL.
select ds_theta_exclude(null_str, some_nulls)
from functional_parquet.nullrows
where id = 'b';
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# Check that ds_theta_exclude() returns a NULL sketch when A is NULL, here with B being
# a sketch built by ds_theta_sketch().
select ds_theta_exclude(null_str, sketch)
from (
    select
        null null_str,
        ds_theta_sketch(some_nulls) sketch
    from functional_parquet.nullrows
    where id = 'a'
) t;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# Check that ds_theta_exclude() returns an empty sketch when A is empty and B is null.
select
    ds_theta_estimate(ds_theta_exclude(ds_theta_sketch(f2), null))
from
    functional_parquet.emptytable;
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# Check that ds_theta_exclude() returns a NULL sketch when A is null and B is empty.
select ds_theta_exclude(null, ds_theta_sketch(f2))
from functional_parquet.emptytable;
---- TYPES
STRING
---- RESULTS
'NULL'
====
---- QUERY
# Check that ds_theta_exclude() returns an empty sketch when A and B are both empty.
select
    ds_theta_estimate(
        ds_theta_exclude(ds_theta_sketch(field), ds_theta_sketch(f2)))
from
    functional_parquet.emptytable;
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# ds_theta_exclude() returns an error when its inputs are not valid serialized sketches.
select ds_theta_exclude(date_string_col, date_string_col)
from functional_parquet.alltypestiny
where id = 1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# Put the same sketches made by Impala and by Hive side by side in one table, with an
# i_ column prefix for the Impala sketches and an h_ prefix for the Hive sketches.
# The two source tables are combined with an explicit cross join: no join condition is
# intended, and the single expected result row shows the product is a single row.
# For every column pair the estimate of "in sketch A but not in sketch B" is 0.
create table theta_sketches_impala_hive2 (
    i_ti string, i_i string, i_bi string, i_f string, i_d string, i_s string,
    i_c string, i_v string, i_nc string,
    h_ti string, h_i string, h_bi string, h_f string, h_d string, h_s string,
    h_c string, h_v string, h_nc string
)
stored as parquet;
insert overwrite theta_sketches_impala_hive2
select
    i.ti as i_ti, i.i as i_i, i.bi as i_bi, i.f as i_f, i.d as i_d, i.s as i_s,
    i.c as i_c, i.v as i_v, i.nc as i_nc,
    h.ti as h_ti, h.i as h_i, h.bi as h_bi, h.f as h_f, h.d as h_d, h.s as h_s,
    h.c as h_c, h.v as h_v, h.nc as h_nc
from theta_sketches_from_impala i
cross join theta_sketches_from_hive h;
select
    ds_theta_estimate(ds_theta_exclude(i_ti, h_ti)) as ti,
    ds_theta_estimate(ds_theta_exclude(i_i, h_i)) as i,
    ds_theta_estimate(ds_theta_exclude(i_bi, h_bi)) as bi,
    ds_theta_estimate(ds_theta_exclude(i_f, h_f)) as f,
    ds_theta_estimate(ds_theta_exclude(i_d, h_d)) as d,
    ds_theta_estimate(ds_theta_exclude(i_s, h_s)) as s,
    ds_theta_estimate(ds_theta_exclude(i_c, h_c)) as c,
    ds_theta_estimate(ds_theta_exclude(i_v, h_v)) as v,
    ds_theta_estimate(ds_theta_exclude(i_nc, h_nc)) as nc
from theta_sketches_impala_hive2;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
0,0,0,0,0,0,0,0,0
====
---- QUERY
# Check that the result of an A-not-B can be a non-empty sketch. id1 holds 1..10 and
# id2 holds the even numbers 2..20, so five id1 values (1, 3, 5, 7, 9) are not in id2.
create table sketch_input (
    id1 int,
    id2 int
);
insert into table sketch_input values
    (1, 2), (2, 4), (3, 6), (4, 8), (5, 10),
    (6, 12), (7, 14), (8, 16), (9, 18), (10, 20);
create table sketch_intermediate (
    sketch1 string,
    sketch2 string
)
stored as parquet;
insert overwrite sketch_intermediate
    select
        ds_theta_sketch(id1),
        ds_theta_sketch(id2)
    from sketch_input;
select ds_theta_estimate(ds_theta_exclude(sketch1, sketch2))
from sketch_intermediate;
---- TYPES
BIGINT
---- RESULTS
5
====
---- QUERY
# Check that unioning a valid sketch with a null value returns the valid sketch,
# whichever argument position the null is in.
select
    ds_theta_estimate(ds_theta_union_f(date_sketch, null)),
    ds_theta_estimate(ds_theta_union_f(null, float_sketch))
from
    sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
3,10
3,10
3,10
3,10
====
---- QUERY
# Check that ds_theta_union_f() returns an empty sketch when given an empty sketch and
# null.
select
    ds_theta_estimate(
        ds_theta_union_f(ds_theta_sketch(cast(f2 as float)), null))
from
    functional_parquet.emptytable;
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# Check that ds_theta_union_f() returns an empty sketch for NULL inputs.
select ds_theta_estimate(ds_theta_union_f(null_str, some_nulls))
from functional_parquet.nullrows
where id = 'b';
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# ds_theta_union_f() returns an error when its second argument is not a valid serialized
# sketch (first argument null).
select ds_theta_union_f(null, date_string_col)
from functional_parquet.alltypestiny
where id = 1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# ds_theta_union_f() returns an error when its first argument is not a valid serialized
# sketch (second argument null).
select ds_theta_union_f(date_string_col, null)
from functional_parquet.alltypestiny
where id = 1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# Union each Impala sketch with its Hive counterpart from theta_sketches_impala_hive2
# and check that the union gives the same results as estimating the sketches separately.
select
    ds_theta_estimate(ds_theta_union_f(i_ti, h_ti)) as ti,
    ds_theta_estimate(ds_theta_union_f(i_i, h_i)) as i,
    ds_theta_estimate(ds_theta_union_f(i_bi, h_bi)) as bi,
    ds_theta_estimate(ds_theta_union_f(i_f, h_f)) as f,
    ds_theta_estimate(ds_theta_union_f(i_d, h_d)) as d,
    ds_theta_estimate(ds_theta_union_f(i_s, h_s)) as s,
    ds_theta_estimate(ds_theta_union_f(i_c, h_c)) as c,
    ds_theta_estimate(ds_theta_union_f(i_v, h_v)) as v,
    ds_theta_estimate(ds_theta_union_f(i_nc, h_nc)) as nc
from
    theta_sketches_impala_hive2;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,0
====
---- QUERY
# Union two sketches that come from different columns of sketch_intermediate; the
# expected estimate of the union is 15.
select ds_theta_estimate(ds_theta_union_f(sketch1, sketch2))
from sketch_intermediate;
---- TYPES
BIGINT
---- RESULTS
15
====
---- QUERY
# Check that intersecting a valid sketch with a null value, in either argument position,
# gives a result that estimates to 0.
select
    ds_theta_estimate(ds_theta_intersect_f(date_sketch, null)),
    ds_theta_estimate(ds_theta_intersect_f(null, float_sketch))
from
    sketch_store;
---- TYPES
BIGINT,BIGINT
---- RESULTS
0,0
0,0
0,0
0,0
====
---- QUERY
# Check that ds_theta_intersect_f() returns an empty sketch for two empty sketches.
select
    ds_theta_estimate(
        ds_theta_intersect_f(
            ds_theta_sketch(cast(f2 as float)),
            ds_theta_sketch(cast(f2 as float))))
from
    functional_parquet.emptytable;
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# Check that ds_theta_intersect_f() of two null inputs gives a result that estimates
# to 0.
select ds_theta_estimate(ds_theta_intersect_f(null_str, some_nulls))
from functional_parquet.nullrows
where id = 'b';
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# ds_theta_intersect_f() returns an error when its first argument is not a valid
# serialized sketch (second argument null).
select ds_theta_intersect_f(date_string_col, null)
from functional_parquet.alltypestiny
where id = 1;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# ds_theta_intersect_f() returns an error when its first argument is a valid sketch
# (built by ds_theta_sketch()) but its second argument is a plain string rather than a
# serialized sketch.
select ds_theta_intersect_f(sketch1, sketch2)
from (
    select
        ds_theta_sketch(float_col) sketch1,
        max(date_string_col) sketch2
    from functional_parquet.alltypestiny
) t;
---- CATCH
UDF ERROR: Unable to deserialize sketch: sketch type mismatch: expected 3, actual 47
====
---- QUERY
# Intersect each Impala sketch with its Hive counterpart from
# theta_sketches_impala_hive2 and check that the intersection gives the same results as
# estimating the sketches separately.
select
    ds_theta_estimate(ds_theta_intersect_f(i_ti, h_ti)) as ti,
    ds_theta_estimate(ds_theta_intersect_f(i_i, h_i)) as i,
    ds_theta_estimate(ds_theta_intersect_f(i_bi, h_bi)) as bi,
    ds_theta_estimate(ds_theta_intersect_f(i_f, h_f)) as f,
    ds_theta_estimate(ds_theta_intersect_f(i_d, h_d)) as d,
    ds_theta_estimate(ds_theta_intersect_f(i_s, h_s)) as s,
    ds_theta_estimate(ds_theta_intersect_f(i_c, h_c)) as c,
    ds_theta_estimate(ds_theta_intersect_f(i_v, h_v)) as v,
    ds_theta_estimate(ds_theta_intersect_f(i_nc, h_nc)) as nc
from
    theta_sketches_impala_hive2;
---- TYPES
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,4,4,3,0
====
---- QUERY
# Check that intersecting two non-empty sketches that have nothing in common gives an
# empty result.
select
    ds_theta_estimate(ds_theta_intersect_f(date_sketch, float_sketch))
from
    sketch_store
where
    year = 2009
    and month = 1;
---- TYPES
BIGINT
---- RESULTS
0
====
---- QUERY
# Check the case where the inputs are not the same but have some (not all) items in
# common; the expected estimate of the intersection is 5.
select ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2))
from sketch_intermediate;
---- TYPES
BIGINT
---- RESULTS
5
====