mirror of
https://github.com/apache/impala.git
synced 2026-01-13 18:00:19 -05:00
This patch adds support for BINARY columns for all table formats with the exception of Kudu. In Hive the main difference between STRING and BINARY is that STRING is assumed to be UTF8 encoded, while BINARY can be any byte array. Some other differences in Hive: - BINARY can only be cast from/to STRING - Only a small subset of built-in STRING functions support BINARY. - In several file formats (e.g. text) BINARY is base64 encoded. - No NDV is calculated during COMPUTE STATISTICS. As Impala doesn't treat STRINGs as UTF8, BINARY and STRING become nearly identical, especially from the backend's perspective. For this reason, BINARY is implemented a bit differently compared to other types: while the frontend treats STRING and BINARY as two separate types, most of the backend uses PrimitiveType::TYPE_STRING for BINARY too, e.g. in SlotDesc. Only the following parts of the backend need to differentiate between STRING and BINARY: - table scanners - table writers - HS2/Beeswax service These parts have access to column metadata, which allows adding special handling for BINARY. Only a few builtins are allowed for BINARY at the moment: - length - min/max/count - coalesce and similar "selector" functions Other STRING functions can only be used by casting to STRING first. Adding support for more of these functions is very easy, as the BINARY type simply has to be "connected" to the already existing STRING function's signature. Functions where the result depends on utf8_mode need to ensure that with BINARY it always works as if utf8_mode=0 (for example length() is mapped to bytes(), as length() counts UTF-8 characters if utf8_mode=1). All kinds of UDFs (native, Hive legacy, Hive generic) support BINARY, though in the case of legacy Hive UDFs it is only supported if the argument and return types are set explicitly, to ensure backward compatibility. See IMPALA-11340 for details. 
The original plan was to behave as close to Hive as possible, but I realized that Hive has more relaxed casting rules than Impala, which led to STRING<->BINARY casts being necessary in more cases in Impala. This was needed to disallow passing a BINARY to functions that expect a STRING argument. An example of the difference is that in INSERT ... VALUES () string literals need to be explicitly cast to BINARY, while this is not needed in Hive. Testing: - Added functional.binary_tbl for all file formats (except Kudu) to test scanning. - Removed functional.unsupported_types and related tests, as now Impala supports all (non-complex) types that Hive does. - Added FE/EE tests mainly based on the ones added for the DATE type Change-Id: I36861a9ca6c2047b0d76862507c86f7f153bc582 Reviewed-on: http://gerrit.cloudera.org:8080/16066 Reviewed-by: Quanlong Huang <huangquanlong@gmail.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
370 lines
8.4 KiB
Plaintext
370 lines
8.4 KiB
Plaintext
====
---- QUERY
# hive_pi() takes no arguments; the expected row below is pi, typed DOUBLE.
select hive_pi()
---- RESULTS
3.141592653589793
---- TYPES
DOUBLE
====
---- QUERY
# hive_bin(100) is expected to return the base-2 digits of 100 as a STRING.
select hive_bin(100)
---- RESULTS
'1100100'
---- TYPES
STRING
====
---- QUERY
# Same UDF, now evaluated as the argument of an aggregate over a table scan.
select min(hive_pi()) from functional.alltypesagg
---- RESULTS
3.141592653589793
---- TYPES
DOUBLE
====
---- QUERY
# Test identity functions
# BOOLEAN: one literal and one typed NULL are passed through identity() and both are
# expected back unchanged.
select identity(true), identity(cast(NULL as boolean));
---- TYPES
boolean, boolean
---- RESULTS
true,NULL
====
---- QUERY
# TINYINT round-trip through identity(): one value and one typed NULL.
select identity(cast(10 as tinyint)), identity(cast(NULL as tinyint));
---- TYPES
tinyint, tinyint
---- RESULTS
10,NULL
====
---- QUERY
# SMALLINT round-trip through identity(): one value and one typed NULL.
select identity(cast(10 as smallint)), identity(cast(NULL as smallint));
---- TYPES
smallint, smallint
---- RESULTS
10,NULL
====
---- QUERY
# INT round-trip through identity(): one value and one typed NULL.
select identity(cast(10 as int)), identity(cast(NULL as int));
---- TYPES
int, int
---- RESULTS
10,NULL
====
---- QUERY
# BIGINT round-trip through identity(): one value and one typed NULL.
select identity(cast(10 as bigint)), identity(cast(NULL as bigint));
---- TYPES
bigint, bigint
---- RESULTS
10,NULL
====
---- QUERY
# FLOAT round-trip through identity(): one value and one typed NULL.
# The expected text for the value 10.0 is 10.
select identity(cast(10.0 as float)), identity(cast(NULL as float));
---- TYPES
float, float
---- RESULTS
10,NULL
====
---- QUERY
# DOUBLE round-trip through identity(): one value and one typed NULL.
# The expected text for the value 10.0 is 10.
select identity(cast(10.0 as double)), identity(cast(NULL as double));
---- TYPES
double, double
---- RESULTS
10,NULL
====
---- QUERY
# IMPALA-1456. Each "identity" call below tests a different type (BytesWritable, Text,
# and String).
# Per the expected row, the two- and three-argument overloads return their arguments
# concatenated, and all-NULL arguments yield NULL.
select identity("why hello there"),
    identity("why", " hello there"),
    identity("why", " hello", " there"),
    identity(cast(NULL as string)),
    identity(cast(NULL as string), cast(NULL as string)),
    identity(cast(NULL as string), cast(NULL as string), cast(NULL as string));
---- TYPES
string, string, string, string, string, string
---- RESULTS
'why hello there','why hello there','why hello there','NULL','NULL','NULL'
====
---- QUERY
# BINARY round-trip through identity(): a string literal cast to BINARY and a typed
# NULL. The result columns are expected to be typed BINARY.
select identity(cast("a" as binary)), identity(cast(NULL as binary));
---- TYPES
binary, binary
---- RESULTS
'a','NULL'
====
---- QUERY
# IMPALA-1134. Each "identity" call below tests a different type (BytesWritable, Text,
# and String). The different types are handled slightly differently.
# The expected lengths 10, 20 and 30 equal the combined length of the 10-character
# arguments passed to each call.
select length(identity("0123456789")),
    length(identity("0123456789", "0123456789")),
    length(identity("0123456789", "0123456789", "0123456789"));
---- TYPES
int, int, int
---- RESULTS
10,20,30
====
---- QUERY
# IMPALA-1392: Hive UDFs that throw exceptions should return NULL
# (this section does not set abort_java_udf_on_exception).
select throws_exception();
---- TYPES
boolean
---- RESULTS
NULL
====
---- QUERY
# With abort_java_udf_on_exception=true the query is expected to fail instead of
# yielding NULL; the error text must contain 'Test exception'.
set abort_java_udf_on_exception=true;
select throws_exception() from functional.alltypestiny;
---- CATCH
Test exception
====
---- QUERY
# abort_java_udf_on_exception is not set in this section, so the throwing UDF is
# expected to produce NULL again: 8 NULL rows for functional.alltypestiny.
select throws_exception() from functional.alltypestiny;
---- TYPES
boolean
---- RESULTS
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
====
---- QUERY
# hive_add() with INT arguments: 1 + 2 is expected to be 3, typed INT.
select hive_add(cast(1 as int), cast(2 as int));
---- TYPES
int
---- RESULTS
3
====
---- QUERY
# A hive_add() call used directly as the argument of another hive_add(): (1 + 2) + 2 = 5.
select hive_add(hive_add(cast(1 as int), cast(2 as int)), cast(2 as int));
---- TYPES
int
---- RESULTS
5
====
---- QUERY
# Two hive_add() results combined by a built-in subtraction and cast back to INT before
# being fed to an outer hive_add(): ((1 + 2) - (2 + 1)) + 2 = 2.
select hive_add(cast(hive_add(cast(1 as int), cast(2 as int)) - hive_add(cast(2 as int), cast(1 as int)) as int), cast(2 as int));
---- TYPES
int
---- RESULTS
2
====
---- QUERY
# hive_add() with SMALLINT arguments; the result is expected to stay SMALLINT.
select hive_add(cast(1 as smallint), cast(2 as smallint));
---- TYPES
smallint
---- RESULTS
3
====
---- QUERY
# hive_add() with FLOAT arguments; the result is expected to stay FLOAT.
select hive_add(cast(1.0 as float), cast(2.0 as float));
---- TYPES
float
---- RESULTS
3.0
====
---- QUERY
# hive_add() with DOUBLE arguments; the result is expected to stay DOUBLE.
select hive_add(cast(1.0 as double), cast(2.0 as double));
---- TYPES
double
---- RESULTS
3.0
====
---- QUERY
# hive_add() with BOOLEAN arguments: true combined with false is expected to give false.
# NOTE(review): presumably the BOOLEAN overload is a logical AND - confirm against the
# UDF source.
select hive_add(cast(1 as boolean), cast(0 as boolean));
---- TYPES
boolean
---- RESULTS
false
====
---- QUERY
# Test that all of the persistent Java UDFs are accessible.
# BOOLEAN signature of identity_anytype.
select identity_anytype(true);
---- TYPES
boolean
---- RESULTS
true
====
---- QUERY
# TINYINT signature of identity_anytype.
select identity_anytype(cast(10 as tinyint));
---- TYPES
tinyint
---- RESULTS
10
====
---- QUERY
# SMALLINT signature of identity_anytype.
select identity_anytype(cast(10 as smallint));
---- TYPES
smallint
---- RESULTS
10
====
---- QUERY
# INT signature of identity_anytype.
select identity_anytype(cast(10 as int));
---- TYPES
int
---- RESULTS
10
====
---- QUERY
# BIGINT signature of identity_anytype.
select identity_anytype(cast(10 as bigint));
---- TYPES
bigint
---- RESULTS
10
====
---- QUERY
# FLOAT signature of identity_anytype; the expected text for 10.0 is 10.
select identity_anytype(cast(10.0 as float));
---- TYPES
float
---- RESULTS
10
====
---- QUERY
# DOUBLE signature of identity_anytype; the expected text for 10.0 is 10.
select identity_anytype(cast(10.0 as double));
---- TYPES
double
---- RESULTS
10
====
---- QUERY
# Two-argument STRING signature of identity_anytype; the expected result is the two
# arguments concatenated.
select identity_anytype("a", "b");
---- TYPES
string
---- RESULTS
'ab'
====
---- QUERY
# Three-argument STRING signature of identity_anytype; the expected result is the three
# arguments concatenated.
select identity_anytype("a", "b", "c");
---- TYPES
string
---- RESULTS
'abc'
====
---- QUERY
# BINARY is only supported when the function is created
# specifically with BINARY arguments / return type (IMPALA-11340).
# Calling identity_anytype with a BINARY argument is therefore expected to be rejected
# with a "no matching function" analysis error.
select identity_anytype(cast("a" as binary));
---- CATCH
AnalysisException: No matching function with signature
====
---- QUERY
# IMPALA-3378: test many Java UDFs being opened and run concurrently
# The query nests 16 UNION ALL branches. Each branch filters functional.alltypesagg
# through identity() on a different column / threshold and returns one max(int_col)
# row, so 16 result rows are expected.
select * from
(select max(int_col) from functional.alltypesagg
  where identity(bool_col) union all
(select max(int_col) from functional.alltypesagg
  where identity(tinyint_col) > 1 union all
(select max(int_col) from functional.alltypesagg
  where identity(smallint_col) > 1 union all
(select max(int_col) from functional.alltypesagg
  where identity(int_col) > 1 union all
(select max(int_col) from functional.alltypesagg
  where identity(bigint_col) > 1 union all
(select max(int_col) from functional.alltypesagg
  where identity(float_col) > 1.0 union all
(select max(int_col) from functional.alltypesagg
  where identity(double_col) > 1.0 union all
(select max(int_col) from functional.alltypesagg
  where identity(string_col) > '1' union all
(select max(int_col) from functional.alltypesagg
  where not identity(bool_col) union all
(select max(int_col) from functional.alltypesagg
  where identity(tinyint_col) > 2 union all
(select max(int_col) from functional.alltypesagg
  where identity(smallint_col) > 2 union all
(select max(int_col) from functional.alltypesagg
  where identity(int_col) > 2 union all
(select max(int_col) from functional.alltypesagg
  where identity(bigint_col) > 2 union all
(select max(int_col) from functional.alltypesagg
  where identity(float_col) > 2.0 union all
(select max(int_col) from functional.alltypesagg
  where identity(double_col) > 2.0 union all
(select max(int_col) from functional.alltypesagg
  where identity(string_col) > '2'
)))))))))))))))) v
---- TYPES
INT
---- RESULTS
998
999
999
999
999
999
999
999
999
999
999
999
999
999
999
999
====
---- QUERY
# Fixture for the IMPALA-4266 regression test: drop any leftover input table first.
drop table if exists replace_string_input
====
---- QUERY
# Seven input strings, including duplicates ('scone' and 'stuff' appear twice), so the
# DISTINCT in the query that follows has groups to merge.
create table replace_string_input as
values('toast'), ('scone'), ('stuff'), ('sssss'), ('yes'), ('scone'), ('stuff');
====
---- QUERY
# Regression test for IMPALA-4266: memory management bugs with output strings from
# Java UDFs, exposed by using the UDF as a grouping key in an aggregation.
# The UDF replaces "s" with "ss" in the strings.
# Five distinct output strings are expected, in ascending order.
select distinct replace_string(_c0) as es
from replace_string_input
order by 1;
---- TYPES
string
---- RESULTS
'sscone'
'ssssssssss'
'sstuff'
'toasst'
'yess'
====
---- QUERY
# Regression test for IMPALA-8016; this UDF loads another class in the same jar.
# The expected result is 'Hello' for the argument "placeholder".
select import_nearby_classes("placeholder");
---- TYPES
string
---- RESULTS
'Hello'
====
---- QUERY
# In the interpreted code path of HiveUdfCall, we use ScalarExprEvaluator::GetValue which
# evaluates the child expression and returns a nullptr if it is NULL. But in the case of
# TYPE_DATE it also returns nullptr if the date is invalid, so the interpreted path would
# handle invalid dates as nulls. Java UDFs involving Date are not allowed yet; this test
# is a reminder that we'll have to handle this case in codegen when we add support for
# Date. For now, creating such a function is expected to fail analysis.
create function identity(Date) returns Date
    location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
    symbol='org.apache.impala.TestUdf';
---- CATCH
AnalysisException: Type DATE is not supported for Java UDFs.
====
---- QUERY
# Timestamp values are not supported in Java UDFs yet and the implementation may have been
# written without Timestamps in mind. This test is a reminder that we'll need to review
# and test the implementation of HiveUdfCall (especially codegen) with Timestamp values
# when we add support for them.
# For now, creating such a function is expected to fail analysis.
create function identity(Timestamp) returns Timestamp
    location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
    symbol='org.apache.impala.TestUdf';
---- CATCH
AnalysisException: Type TIMESTAMP is not supported for Java UDFs.
====
---- QUERY
# A variadic signature (int...) backed by a Hive UDF class is expected to be rejected
# when the function is created.
create function var_args_func(int...) returns int
    location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
    symbol='org.apache.impala.TestUdf';
---- CATCH
CatalogException: Variable arguments not supported in Hive UDFs.
====