mirror of
https://github.com/apache/impala.git
synced 2026-01-20 03:03:01 -05:00
This patch adds support for BINARY columns for all table formats with the exception of Kudu. In Hive the main difference between STRING and BINARY is that STRING is assumed to be UTF8 encoded, while BINARY can be any byte array. Some other differences in Hive: - BINARY can only be cast from/to STRING - Only a small subset of built-in STRING functions support BINARY. - In several file formats (e.g. text) BINARY is base64 encoded. - No NDV is calculated during COMPUTE STATISTICS. As Impala doesn't treat STRINGs as UTF8, BINARY and STRING become nearly identical, especially from the backend's perspective. For this reason, BINARY is implemented a bit differently compared to other types: while the frontend treats STRING and BINARY as two separate types, most of the backend uses PrimitiveType::TYPE_STRING for BINARY too, e.g. in SlotDesc. Only the following parts of the backend need to differentiate between STRING and BINARY: - table scanners - table writers - HS2/Beeswax service These parts have access to column metadata, which allows adding special handling for BINARY. Only a few builtins are allowed for BINARY at the moment: - length - min/max/count - coalesce and similar "selector" functions Other STRING functions can only be used by casting to STRING first. Adding support for more of these functions is very easy, as the BINARY type simply has to be "connected" to the already existing STRING function's signature. Functions where the result depends on utf8_mode need to ensure that with BINARY it always works as if utf8_mode=0 (for example length() is mapped to bytes() as length() counts UTF-8 characters if utf8_mode=1). All kinds of UDFs (native, Hive legacy, Hive generic) support BINARY, though in case of legacy Hive UDFs it is only supported if the argument and return types are set explicitly to ensure backward compatibility. See IMPALA-11340 for details. 
The original plan was to behave as close to Hive as possible, but I realized that Hive has more relaxed casting rules than Impala, which led to STRING<->BINARY casts being necessary in more cases in Impala. This was needed to disallow passing a BINARY to functions that expect a STRING argument. An example of the difference is that in INSERT ... VALUES () string literals need to be explicitly cast to BINARY, while this is not needed in Hive. Testing: - Added functional.binary_tbl for all file formats (except Kudu) to test scanning. - Removed functional.unsupported_types and related tests, as now Impala supports all (non-complex) types that Hive does. - Added FE/EE tests mainly based on the ones added for the DATE type Change-Id: I36861a9ca6c2047b0d76862507c86f7f153bc582 Reviewed-on: http://gerrit.cloudera.org:8080/16066 Reviewed-by: Quanlong Huang <huangquanlong@gmail.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
249 lines
6.4 KiB
Plaintext
249 lines
6.4 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Populate insertalltypesagg from functional.alltypesagg.
# The third select-list expression is string_col cast to BINARY; string_col
# itself is also passed through unchanged later in the list.
# RESULTS expects the insert to report 11000 rows.
insert into table insertalltypesagg
|
|
select id, bigint_col, cast(string_col as binary), bool_col, date_string_col, day, double_col, float_col,
|
|
int_col, month, smallint_col, string_col, timestamp_col, tinyint_col, year from functional.alltypesagg
|
|
---- RESULTS
|
|
: 11000
|
|
====
|
|
---- QUERY
|
|
# Spot-check the inserted data: read back the two smallest ids above 300
# together with their bool_col values. ORDER BY id keeps the LIMIT 2
# result deterministic.
select id, bool_col from insertalltypesagg
|
|
WHERE id > 300
|
|
ORDER BY id
|
|
LIMIT 2
|
|
---- RESULTS
|
|
301,false
|
|
302,true
|
|
---- TYPES
|
|
INT, BOOLEAN
|
|
====
|
|
---- QUERY
|
|
# Insert every source row again, but with the constant id 9999999 and the
# constant false in the position that held bool_col in the first insert.
# NOTE(review): the verification query that follows expects a single row for
# this id, so rows sharing an id presumably overwrite each other (e.g. id is
# the table's row key) - confirm against the table definition.
insert into table insertalltypesagg
|
|
select 9999999, bigint_col, cast(string_col as binary), false, date_string_col, day, double_col, float_col,
|
|
int_col, month, smallint_col, string_col, timestamp_col, tinyint_col, year from functional.alltypesagg
|
|
---- RESULTS
|
|
: 11000
|
|
====
|
|
---- QUERY
|
|
# Verify the constant-id insert: although LIMIT 2 would allow two rows,
# exactly one row with id 9999999 is expected, and its bool_col is false.
select id, bool_col from insertalltypesagg
|
|
WHERE id = 9999999
|
|
ORDER BY id
|
|
LIMIT 2
|
|
---- RESULTS
|
|
9999999,false
|
|
---- TYPES
|
|
INT, BOOLEAN
|
|
====
|
|
---- QUERY
|
|
# Test insert into ... select * where the same table is both the source and
# the target of the insert.
# LIMIT 1 is used to reduce execution time; only the reported row count (1)
# is checked, so which row gets picked does not matter.
insert into table insertalltypesagg
|
|
select * from insertalltypesagg limit 1
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Test inserting Hive's default text representation of NULL, '\N', as an
# actual value, and make sure a scan returns the string and not NULL.
# The literal is written as "\\N" (escaped backslash followed by N) in three
# places: the value cast to BINARY, and the two string values in the
# positions that held date_string_col and string_col in the earlier inserts.
# The remaining values come from an arbitrary source row (LIMIT 1); they are
# not checked by the verification query that follows.
insert into table insertalltypesagg
|
|
select 9999999, bigint_col, cast("\\N" as binary), false, "\\N", day, double_col, float_col,
|
|
int_col, month, smallint_col, "\\N", timestamp_col, tinyint_col, year from functional.alltypesagg limit 1
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Read back the three columns that were written with the "\\N" literal for
# id 9999999. A single row is expected, and all three columns must contain
# the literal value rather than NULL.
select id, date_string_col, string_col, binary_col from insertalltypesagg
|
|
where id = 9999999
|
|
---- RESULTS
|
|
9999999,'\\N','\\N','\\N'
|
|
---- TYPES
|
|
INT, STRING, STRING, BINARY
|
|
====
|
|
---- QUERY
|
|
# Same bulk load as for insertalltypesagg, this time targeting
# insertalltypesaggbinary: all rows of functional.alltypesagg, with
# string_col additionally supplied as a BINARY value.
# RESULTS expects the insert to report 11000 rows.
insert into table insertalltypesaggbinary
|
|
select id, bigint_col, cast(string_col as binary), bool_col, date_string_col, day, double_col, float_col,
|
|
int_col, month, smallint_col, string_col, timestamp_col, tinyint_col, year from functional.alltypesagg
|
|
---- RESULTS
|
|
: 11000
|
|
====
|
|
---- QUERY
|
|
# Consistency check of insertalltypesaggbinary against its source table.
# The tables are joined on id, and a pair only counts if every compared
# column matches, where "matches" means equal or NULL on both sides (plain
# '=' alone would drop pairs in which both values are NULL).
# Compared columns: bigint_col, bool_col, date_string_col, double_col,
# float_col, int_col, smallint_col, tinyint_col, string_col, timestamp_col.
# Not compared: binary_col, day, month, year.
select count(*) from (
|
|
select hb.* from insertalltypesaggbinary hb, functional.alltypesagg a
|
|
where hb.id = a.id
|
|
and (hb.bigint_col = a.bigint_col or
|
|
(hb.bigint_col is null and a.bigint_col is null))
|
|
and (hb.bool_col = a.bool_col or
|
|
(hb.bool_col is null and a.bool_col is null))
|
|
and (hb.date_string_col = a.date_string_col or
|
|
(hb.date_string_col is null and a.date_string_col is null))
|
|
and (hb.double_col = a.double_col or
|
|
(hb.double_col is null and a.double_col is null))
|
|
and (hb.float_col = a.float_col or
|
|
(hb.float_col is null and a.float_col is null))
|
|
and (hb.int_col = a.int_col or
|
|
(hb.int_col is null and a.int_col is null))
|
|
and (hb.smallint_col = a.smallint_col or
|
|
(hb.smallint_col is null and a.smallint_col is null))
|
|
and (hb.tinyint_col = a.tinyint_col or
|
|
(hb.tinyint_col is null and a.tinyint_col is null))
|
|
and (hb.string_col = a.string_col or
|
|
(hb.string_col is null and a.string_col is null))
|
|
and (hb.timestamp_col = a.timestamp_col or
|
|
(hb.timestamp_col is null and a.timestamp_col is null))
|
|
) x
|
|
---- RESULTS
|
|
11000
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
|
|
# Spot-check insertalltypesaggbinary: the two smallest ids above 300 and
# their bool_col values. ORDER BY id keeps the LIMIT 2 result deterministic.
select id, bool_col from insertalltypesaggbinary
|
|
WHERE id > 300
|
|
ORDER BY id
|
|
LIMIT 2
|
|
---- RESULTS
|
|
301,false
|
|
302,true
|
|
---- TYPES
|
|
INT, BOOLEAN
|
|
====
|
|
---- QUERY
|
|
# Insert every source row with the constant id 9999999 and the constant
# false in the position that held bool_col in the first insert.
# NOTE(review): the verification query that follows expects a single row for
# this id, so rows sharing an id presumably overwrite each other (e.g. id is
# the table's row key) - confirm against the table definition.
insert into table insertalltypesaggbinary
|
|
select 9999999, bigint_col, cast(string_col as binary), false, date_string_col, day, double_col, float_col,
|
|
int_col, month, smallint_col, string_col, timestamp_col, tinyint_col, year from functional.alltypesagg
|
|
---- RESULTS
|
|
: 11000
|
|
====
|
|
---- QUERY
|
|
# Verify the constant-id insert: although LIMIT 2 would allow two rows,
# exactly one row with id 9999999 is expected, and its bool_col is false.
select id, bool_col from insertalltypesaggbinary
|
|
WHERE id = 9999999
|
|
ORDER BY id
|
|
LIMIT 2
|
|
---- RESULTS
|
|
9999999,false
|
|
---- TYPES
|
|
INT, BOOLEAN
|
|
====
|
|
---- QUERY
|
|
# Test insert into ... select * where the same table is both the source and
# the target of the insert.
# LIMIT 1 is used to reduce execution time; only the reported row count (1)
# is checked, so which row gets picked does not matter.
insert into table insertalltypesaggbinary
|
|
select * from insertalltypesaggbinary limit 1
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Test inserting Hive's default text representation of NULL, '\N', as an
# actual value, and make sure a scan returns the string and not NULL.
# The literal is written as "\\N" (escaped backslash followed by N) in three
# places: the value cast to BINARY, and the two string values in the
# positions that held date_string_col and string_col in the earlier inserts.
# The remaining values come from an arbitrary source row (LIMIT 1); they are
# not checked by the verification query that follows.
insert into table insertalltypesaggbinary
|
|
select 9999999, bigint_col, cast("\\N" as binary), false, "\\N", day, double_col, float_col,
|
|
int_col, month, smallint_col, "\\N", timestamp_col, tinyint_col, year from functional.alltypesagg limit 1
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Read back the three columns that were written with the "\\N" literal for
# id 9999999. A single row is expected, and all three columns must contain
# the literal value rather than NULL.
select id, date_string_col, string_col, binary_col from insertalltypesaggbinary
|
|
where id = 9999999
|
|
---- RESULTS
|
|
9999999,'\\N','\\N','\\N'
|
|
---- TYPES
|
|
INT, STRING, STRING, BINARY
|
|
====
|
|
---- QUERY
|
|
# IMPALA-715: handle large string value.
# Only id and string_col are listed as target columns. The value is 'a'
# right-padded with 'b' to 50000 characters.
insert into table insertalltypesagg(id, string_col) values(9999999, rpad('a', 50000, 'b'))
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# IMPALA-715 verification: the string stored for id 9999999 must come back
# with its full length of 50000 (checked via length() instead of comparing
# the whole value).
select id, length(string_col) from insertalltypesagg
|
|
WHERE id = 9999999
|
|
---- RESULTS
|
|
9999999,50000
|
|
---- TYPES
|
|
INT, INT
|
|
====
|
|
---- QUERY
|
|
# IMPALA-2133: string literals that contain an escaped quote character.
# Two rows are inserted: one single-quoted literal with a backslash-escaped
# single quote (id 99999999) and one double-quoted literal with a
# backslash-escaped double quote (id 999999999).
insert into table insertalltypesagg (id, string_col) values (99999999, 'William\'s'), (999999999, "Other\"s")
|
|
---- RESULTS
|
|
: 2
|
|
====
|
|
---- QUERY
|
|
# IMPALA-2133 verification by id: the stored value must contain the single
# quote. In the expected output the embedded quote appears doubled ('')
# inside the single-quoted result value.
select id, string_col from insertalltypesagg where id = 99999999
|
|
---- RESULTS
|
|
99999999,'William''s'
|
|
---- TYPES
|
|
INT, STRING
|
|
====
|
|
---- QUERY
|
|
# IMPALA-2133 verification by value: the same escaped single-quote literal
# used as a predicate must match the row that was inserted with it.
select id, string_col from insertalltypesagg where string_col = 'William\'s'
|
|
---- RESULTS
|
|
99999999,'William''s'
|
|
---- TYPES
|
|
INT, STRING
|
|
====
|
|
---- QUERY
|
|
# IMPALA-2133 verification by value: the escaped double-quote literal used
# as a predicate must match the row with id 999999999.
select id, string_col from insertalltypesagg where string_col = "Other\"s"
|
|
---- RESULTS
|
|
999999999,'Other"s'
|
|
---- TYPES
|
|
INT, STRING
|
|
====
|
|
---- QUERY
|
|
# Populate insert_date_tbl from functional.date_tbl (id_col, date_col and
# date_part). RESULTS expects the insert to report 22 rows.
insert into table insert_date_tbl
|
|
select id_col, date_col, date_part
|
|
from functional.date_tbl
|
|
---- RESULTS
|
|
: 22
|
|
====
|
|
---- QUERY
|
|
# Spot-check the inserted DATE values: the two smallest id_col values above
# 20 and their date_col. ORDER BY id_col keeps the LIMIT 2 result
# deterministic.
select id_col, date_col from insert_date_tbl
|
|
WHERE id_col > 20
|
|
ORDER BY id_col
|
|
LIMIT 2
|
|
---- RESULTS
|
|
21,0001-06-22
|
|
22,0001-06-23
|
|
---- TYPES
|
|
INT, DATE
|
|
====
|
|
---- QUERY
|
|
# Insert every source row with the constant id 9999999 and the string
# literal '1521-12-13' in the position that held date_part in the first
# insert.
# NOTE(review): the verification query that follows reads date_part back as
# a DATE, so this presumably relies on an implicit STRING-to-DATE conversion
# of the literal - confirm.
# NOTE(review): a single row is expected for this id afterwards, so rows
# sharing an id presumably overwrite each other - confirm against the table
# definition.
insert into table insert_date_tbl
|
|
select 9999999, date_col, '1521-12-13'
|
|
from functional.date_tbl
|
|
---- RESULTS
|
|
: 22
|
|
====
|
|
---- QUERY
|
|
# Verify the constant-id insert: although LIMIT 2 would allow two rows,
# exactly one row with id_col 9999999 is expected, and its date_part is the
# DATE 1521-12-13.
select id_col, date_part from insert_date_tbl
|
|
WHERE id_col = 9999999
|
|
ORDER BY id_col
|
|
LIMIT 2
|
|
---- RESULTS
|
|
9999999,1521-12-13
|
|
---- TYPES
|
|
INT, DATE
|
|
====
|
|
---- QUERY
|
|
# Test insert into ... select * where the same table is both the source and
# the target of the insert.
# LIMIT 1 is used to reduce execution time; only the reported row count (1)
# is checked, so which row gets picked does not matter.
insert into table insert_date_tbl
|
|
select * from insert_date_tbl limit 1
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Insert special characters to binary_col.
# The value is built with unhex() from 32 hex digits, i.e. 16 bytes that
# include 0x00 and bytes above 0x7F, and is explicitly cast to BINARY.
# Only id and binary_col are listed as target columns; the id (99999999) is
# the same one used by the first row of the IMPALA-2133 insert.
insert into table insertalltypesagg (id, binary_col) values (99999999, cast(unhex('00112233445566778899AABBCCDDEEFF') as binary))
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Round-trip check for the binary insert: binary_col is cast to STRING and
# passed to hex(), and the result must equal the hex digits that were
# originally fed to unhex().
select id, hex(cast(binary_col as string)) from insertalltypesagg where id = 99999999
|
|
---- RESULTS
|
|
99999999,'00112233445566778899AABBCCDDEEFF'
|
|
---- TYPES
|
|
INT, STRING
|
|
====
|