mirror of
https://github.com/apache/impala.git
synced 2026-02-01 21:00:29 -05:00
- serialize using bytes instead of stream - avoid unnecessary constructor during deserialization - simplified code slightly - added original exception message to re-thrown generic message Change-Id: I306a2489dac0f4d2d475e8f9987cd58bf95474bb Reviewed-on: http://gerrit.cloudera.org:8080/17818 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
423 lines
14 KiB
Plaintext
423 lines
14 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Use a small table for testing Datasketches HLL functions through Impala to make sure
|
|
# that these approximate functions give the correct result. For testing Impala
|
|
# functionality no need to test how Datasketches HLL approximates count distint values
|
|
# so a small table is enough.
|
|
select
|
|
ds_hll_estimate(ds_hll_sketch(tinyint_col)),
|
|
ds_hll_estimate(ds_hll_sketch(int_col)),
|
|
ds_hll_estimate(ds_hll_sketch(bigint_col)),
|
|
ds_hll_estimate(ds_hll_sketch(float_col)),
|
|
ds_hll_estimate(ds_hll_sketch(double_col)),
|
|
ds_hll_estimate(ds_hll_sketch(string_col))
|
|
from functional_parquet.alltypessmall
|
|
---- RESULTS
|
|
10,10,10,10,10,10
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
select
|
|
ds_hll_sketch_and_estimate(tinyint_col),
|
|
ds_hll_sketch_and_estimate(int_col),
|
|
ds_hll_sketch_and_estimate(bigint_col),
|
|
ds_hll_sketch_and_estimate(float_col),
|
|
ds_hll_sketch_and_estimate(double_col),
|
|
ds_hll_sketch_and_estimate(string_col)
|
|
from functional_parquet.alltypessmall
|
|
---- RESULTS
|
|
10,10,10,10,10,10
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
# Check that unsupported types give error with ds_hll_sketch().
|
|
select ds_hll_sketch(bool_col) from functional_parquet.alltypessmall;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch(BOOLEAN)
|
|
====
|
|
---- QUERY
|
|
select ds_hll_sketch(smallint_col) from functional_parquet.alltypessmall;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch(SMALLINT)
|
|
====
|
|
---- QUERY
|
|
select ds_hll_sketch(cast(date_string_col as date format 'MM/DD/YYYY'))
|
|
from functional_parquet.alltypessmall;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch(DATE)
|
|
====
|
|
---- QUERY
|
|
select ds_hll_sketch(d1) from functional_parquet.decimal_tbl;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch(DECIMAL(9,0))
|
|
====
|
|
---- QUERY
|
|
# Check that unsupported types give error with ds_hll_sketch_and_estimate().
|
|
select ds_hll_sketch_and_estimate(bool_col) from functional_parquet.alltypessmall;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(BOOLEAN)
|
|
====
|
|
---- QUERY
|
|
select ds_hll_sketch_and_estimate(smallint_col) from functional_parquet.alltypessmall;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(SMALLINT)
|
|
====
|
|
---- QUERY
|
|
select ds_hll_sketch_and_estimate(cast(date_string_col as date format 'MM/DD/YYYY'))
|
|
from functional_parquet.alltypessmall;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(DATE)
|
|
====
|
|
---- QUERY
|
|
select ds_hll_sketch_and_estimate(d1) from functional_parquet.decimal_tbl;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_sketch_and_estimate(DECIMAL(9,0))
|
|
====
|
|
---- QUERY
|
|
# Check if HLL works with null values.
|
|
select
|
|
ds_hll_estimate(ds_hll_sketch(null_str)),
|
|
ds_hll_estimate(ds_hll_sketch(null_int)),
|
|
ds_hll_estimate(ds_hll_sketch(null_double)),
|
|
ds_hll_estimate(ds_hll_sketch(some_nulls)),
|
|
ds_hll_sketch_and_estimate(null_str),
|
|
ds_hll_sketch_and_estimate(null_int),
|
|
ds_hll_sketch_and_estimate(null_double),
|
|
ds_hll_sketch_and_estimate(some_nulls)
|
|
from functional_parquet.nullrows
|
|
---- RESULTS
|
|
NULL,NULL,NULL,6,NULL,NULL,NULL,6
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
# Check if HLL works for empty datasets.
|
|
select
|
|
ds_hll_estimate(ds_hll_sketch(field)),
|
|
ds_hll_estimate(ds_hll_sketch(f2)),
|
|
ds_hll_sketch_and_estimate(field),
|
|
ds_hll_sketch_and_estimate(f2)
|
|
from functional_parquet.emptytable
|
|
---- RESULTS
|
|
NULL,NULL,NULL,NULL
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
# Write sketches to a table as string and get an estimate from the written sketch.
|
|
# Note, the plan is to write sketches as binary instead of strings. For this we have to
|
|
# wait for the binary support (IMPALA-9482).
|
|
create table sketch_store
|
|
(year int, month int, date_sketch string, float_sketch string)
|
|
stored as parquet;
|
|
insert into sketch_store
|
|
select
|
|
year,
|
|
month,
|
|
ds_hll_sketch(date_string_col),
|
|
ds_hll_sketch(float_col)
|
|
from functional_parquet.alltypessmall
|
|
group by year, month;
|
|
select
|
|
year,
|
|
month,
|
|
ds_hll_estimate(date_sketch),
|
|
ds_hll_estimate(float_sketch)
|
|
from sketch_store;
|
|
---- RESULTS
|
|
2009,1,3,10
|
|
2009,2,3,10
|
|
2009,3,3,10
|
|
2009,4,3,10
|
|
---- TYPES
|
|
INT,INT,BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_estimate returns error for strings that are not serialized sketches.
|
|
select ds_hll_estimate(date_string_col) from functional_parquet.alltypestiny;
|
|
---- CATCH
|
|
UDF ERROR: Unable to deserialize sketch: Attempt to deserialize unknown object type
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_estimate returns null for null and empty string inputs.
|
|
select ds_hll_estimate(b), ds_hll_estimate(c) from functional_parquet.nulltable;
|
|
---- RESULTS
|
|
NULL,NULL
|
|
---- TYPES
|
|
BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
# Check that sketches made by Hive can be read and used for estimating by Impala.
|
|
select
|
|
ds_hll_estimate(ti) as ti,
|
|
ds_hll_estimate(i) as i,
|
|
ds_hll_estimate(bi) as bi,
|
|
ds_hll_estimate(f) as f,
|
|
ds_hll_estimate(d) as d,
|
|
ds_hll_estimate(s) as s,
|
|
ds_hll_estimate(c) as c,
|
|
ds_hll_estimate(v) as v,
|
|
ds_hll_estimate(nc) as nc
|
|
from hll_sketches_from_hive;
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
---- RESULTS
|
|
5,7,6,5,6,4,3,3,NULL
|
|
====
|
|
---- QUERY
|
|
# Unions the sketches from sketch_store and checks if the union produces the same result
|
|
# as if the whole data was sketched together into a single sketch.
|
|
select
|
|
ds_hll_estimate(ds_hll_union(date_sketch)),
|
|
ds_hll_estimate(ds_hll_union(float_sketch))
|
|
from sketch_store;
|
|
---- TYPES
|
|
BIGINT,BIGINT
|
|
---- RESULTS
|
|
12,10
|
|
====
|
|
---- QUERY
|
|
# Checks that ds_hll_union() produces NULL for an empty dataset.
|
|
select ds_hll_union(field) from functional_parquet.emptytable;
|
|
---- TYPES
|
|
STRING
|
|
---- RESULTS
|
|
'NULL'
|
|
====
|
|
---- QUERY
|
|
# Checks that ds_hll_union() produces NULL for NULL inputs.
|
|
select ds_hll_union(null_str) from functional_parquet.nullrows;
|
|
---- TYPES
|
|
STRING
|
|
---- RESULTS
|
|
'NULL'
|
|
====
|
|
---- QUERY
|
|
# ds_hll_union() returns an error if it receives an invalid serialized sketch.
|
|
select ds_hll_union(date_string_col) from functional_parquet.alltypestiny where id=1;
|
|
---- CATCH
|
|
UDF ERROR: Unable to deserialize sketch: Attempt to deserialize unknown object type
|
|
====
|
|
---- QUERY
|
|
# Get the same sketches from Impala and Hive and put them into the same table. When we
|
|
# get the estimates from the unions of these sketches the expectation is to get the same
|
|
# results as if these sketches were used separately to get the estimates. However, for
|
|
# string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because of the
|
|
# difference between how Impala and Hive uses these types. See IMPALA-9939 for further
|
|
# details.
|
|
create table hll_sketches_impala_hive like hll_sketches_from_impala stored as parquet;
|
|
insert into hll_sketches_impala_hive select * from hll_sketches_from_hive;
|
|
insert into hll_sketches_impala_hive select * from hll_sketches_from_impala;
|
|
select
|
|
ds_hll_estimate(ds_hll_union(ti)) as ti,
|
|
ds_hll_estimate(ds_hll_union(i)) as i,
|
|
ds_hll_estimate(ds_hll_union(bi)) as bi,
|
|
ds_hll_estimate(ds_hll_union(f)) as f,
|
|
ds_hll_estimate(ds_hll_union(d)) as d,
|
|
ds_hll_estimate(ds_hll_union(s)) as s,
|
|
ds_hll_estimate(ds_hll_union(c)) as c,
|
|
ds_hll_estimate(ds_hll_union(v)) as v,
|
|
ds_hll_estimate(ds_hll_union(nc)) as nc
|
|
from hll_sketches_impala_hive;
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
---- RESULTS
|
|
5,7,6,5,6,8,6,6,NULL
|
|
====
|
|
---- QUERY
|
|
# IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values
|
|
create table empty_string (s string, v varchar(1), c char(1));
|
|
insert into empty_string values
|
|
("", cast("" as varchar(1)), cast("" as char(1))),
|
|
("a", cast("a" as varchar(1)), cast("a" as char(1))),
|
|
("", cast("" as varchar(1)), cast("" as char(1))),
|
|
("b", cast("b" as varchar(1)), cast("b" as char(1))),
|
|
("b", cast("b" as varchar(1)), cast("b" as char(1)));
|
|
# Check if HLL works with empty strings.
|
|
select
|
|
ds_hll_estimate(ds_hll_sketch(s)),
|
|
ds_hll_estimate(ds_hll_sketch(v)),
|
|
ds_hll_estimate(ds_hll_sketch(c)),
|
|
ds_hll_sketch_and_estimate(s),
|
|
ds_hll_sketch_and_estimate(v),
|
|
ds_hll_sketch_and_estimate(c)
|
|
from empty_string
|
|
---- RESULTS
|
|
2,2,3,2,2,3
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_stringify() returns null for an empty sketch.
|
|
select ds_hll_stringify(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable;
|
|
---- RESULTS
|
|
'NULL'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_stringify() returns null for a null input.
|
|
select ds_hll_stringify(c) from functional_parquet.nulltable;
|
|
---- RESULTS
|
|
'NULL'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_stringify() returns error for strings that are not serialized sketches.
|
|
select ds_hll_stringify(date_string_col) from functional_parquet.alltypestiny;
|
|
---- CATCH
|
|
UDF ERROR: Unable to deserialize sketch
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_stringify() works on sketches created by Hive.
|
|
select ds_hll_stringify(f) from hll_sketches_from_hive;
|
|
---- RESULTS
|
|
row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.*
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
select ds_hll_stringify(ds_hll_sketch(float_col)) from functional_parquet.alltypestiny;
|
|
---- RESULTS
|
|
row_regex: .*### HLL sketch summary:.*Log Config K.*Hll Target.*Current Mode.*LB.*### End HLL sketch summary.*
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_estimate_bounds_as_string() returns error for strings that are not
|
|
# serialized sketches.
|
|
select ds_hll_estimate_bounds_as_string(date_string_col) from functional_parquet.alltypestiny;
|
|
---- CATCH
|
|
UDF ERROR: Unable to deserialize sketch: Attempt to deserialize unknown object type
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_estimate_bounds_as_string() returns error for kappa is not 1, 2, 3.
|
|
select ds_hll_estimate_bounds_as_string(date_string_col, 4) from functional_parquet.alltypestiny;
|
|
---- CATCH
|
|
UDF ERROR: Kappa must be 1, 2 or 3
|
|
====
|
|
---- QUERY
|
|
select ds_hll_estimate_bounds_as_string(date_string_col, 1.5) from functional_parquet.alltypestiny;
|
|
---- CATCH
|
|
AnalysisException: No matching function with signature: ds_hll_estimate_bounds_as_string(STRING, DECIMAL(2,1)).
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_estimate_bounds_as_string() works on sketches created by Hive.
|
|
select ds_hll_estimate_bounds_as_string(f) from hll_sketches_from_hive;
|
|
---- RESULTS
|
|
'5,5,5.0005'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
select ds_hll_estimate_bounds_as_string(f, 2) from hll_sketches_from_hive;
|
|
---- RESULTS
|
|
'5,5,5.0005'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
select ds_hll_estimate_bounds_as_string(f, 1) from hll_sketches_from_hive;
|
|
---- RESULTS
|
|
'5,5,5.00025'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
select ds_hll_estimate_bounds_as_string(ds_hll_sketch(float_col))
|
|
from functional_parquet.alltypestiny;
|
|
---- RESULTS
|
|
'2,2,2.0002'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
select ds_hll_estimate_bounds_as_string(ds_hll_sketch(float_col), 3)
|
|
from functional_parquet.alltypestiny;
|
|
---- RESULTS
|
|
'2,2,2.0003'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Checks that unioning a valid sketch with a null value result the valid sketch being returned.
|
|
select
|
|
ds_hll_estimate(ds_hll_union_f(date_sketch, null)),
|
|
ds_hll_estimate(ds_hll_union_f(null, float_sketch))
|
|
from sketch_store;
|
|
---- TYPES
|
|
BIGINT,BIGINT
|
|
---- RESULTS
|
|
3,10
|
|
3,10
|
|
3,10
|
|
3,10
|
|
====
|
|
---- QUERY
|
|
# Check that ds_hll_union_f() returns an empty sketch for an empty sketch.
|
|
select ds_hll_estimate(ds_hll_union_f(ds_hll_sketch(cast(f2 as float)), null))
|
|
from functional_parquet.emptytable;
|
|
---- TYPES
|
|
BIGINT
|
|
---- RESULTS
|
|
0
|
|
====
|
|
---- QUERY
|
|
# Checks that ds_hll_union_f() returns an empty sketch for NULL inputs.
|
|
select ds_hll_estimate(ds_hll_union_f(null_str, some_nulls)) from functional_parquet.nullrows
|
|
where id='b';
|
|
---- TYPES
|
|
BIGINT
|
|
---- RESULTS
|
|
0
|
|
====
|
|
---- QUERY
|
|
# ds_hll_union_f() returns an error if it receives an invalid serialized sketch.
|
|
select ds_hll_union_f(null, date_string_col) from functional_parquet.alltypestiny
|
|
where id=1;
|
|
---- CATCH
|
|
UDF ERROR: Unable to deserialize sketch: Attempt to deserialize unknown object type
|
|
====
|
|
---- QUERY
|
|
# ds_hll_union_f() returns an error if it receives an invalid serialized sketch.
|
|
select ds_hll_union_f(date_string_col, null) from functional_parquet.alltypestiny
|
|
where id=1;
|
|
---- CATCH
|
|
UDF ERROR: Unable to deserialize sketch: Attempt to deserialize unknown object type
|
|
====
|
|
---- QUERY
|
|
# Get the same sketches from Impala and Hive and put them into the same table (different
|
|
# column prefix). When we get the estimates from the unions of these sketches the expectation
|
|
# is to get the same results as if these sketches were used separately to get the estimates.
|
|
# However, for string types (STRING, CHAR, VARCHAR) we see the numbers doubling up because
|
|
# of the difference between how Impala and Hive uses these types. See IMPALA-9939 for further
|
|
# details.
|
|
create table hll_sketches_impala_hive2 (
|
|
i_ti string,i_i string,i_bi string,i_f string,i_d string,i_s string,i_c string,i_v string,
|
|
i_nc string,
|
|
h_ti string,h_i string,h_bi string,h_f string,h_d string,h_s string,h_c string,h_v string,
|
|
h_nc string) stored as parquet;
|
|
insert overwrite hll_sketches_impala_hive2 select
|
|
i.ti i_ti, i.i i_i, i.bi i_bi, i.f i_f, i.d i_d, i.s i_s, i.c i_c, i.v i_v,i.nc i_nc,
|
|
h.ti h_ti, h.i h_i, h.bi h_bi, h.f h_f, h.d h_d, h.s h_s, h.c h_c, h.v h_v,h.nc h_nc
|
|
from hll_sketches_from_impala i, hll_sketches_from_hive h;
|
|
select
|
|
ds_hll_estimate(ds_hll_union_f(i_ti, h_ti)) as ti,
|
|
ds_hll_estimate(ds_hll_union_f(i_i, h_i)) as i,
|
|
ds_hll_estimate(ds_hll_union_f(i_bi, h_bi)) as bi,
|
|
ds_hll_estimate(ds_hll_union_f(i_f, h_f)) as f,
|
|
ds_hll_estimate(ds_hll_union_f(i_d, h_d)) as d,
|
|
ds_hll_estimate(ds_hll_union_f(i_s, h_s)) as s,
|
|
ds_hll_estimate(ds_hll_union_f(i_c, h_c)) as c,
|
|
ds_hll_estimate(ds_hll_union_f(i_v, h_v)) as v,
|
|
ds_hll_estimate(ds_hll_union_f(i_nc, h_nc)) as nc
|
|
from hll_sketches_impala_hive2;
|
|
---- TYPES
|
|
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
|
|
---- RESULTS
|
|
5,7,6,5,6,8,6,6,0
|
|
====
|