mirror of
https://github.com/apache/impala.git
synced 2026-01-25 18:01:04 -05:00
KUDU-1938 added VARCHAR column type support to Kudu. This commit adds support for Kudu's VARCHAR type to Impala. The length of a Kudu varchar is applied as a character length as opposed to a byte length like Impala currently uses. When writing data to Kudu, the VARCHAR length is not an issue because Impala only officially supports ASCII characters and those characters are the same size in bytes and characters. Additionally, extra bytes would be truncated by the Kudu client if somehow a value was too long. When reading data from Kudu, it is possible that the value written by some other application is wider in bytes than Impala expects and can handle. This can happen due to multi-byte UTF-8 characters. In that case, we adjust the length in Impala to truncate the extra bytes of the value. This isn’t a great solution, but it is one that other integrations have taken as well, given that Impala doesn’t support UTF-8 values. IMPALA-5675 tracks adding UTF-8 character length support to VARCHAR columns, and this commit has marked the truncation code with a TODO that references that Jira. Testing: * Performed manual testing of standard DDL and DML interaction * Manually reproduced a check failure due to multi-byte characters and tested that length truncation resolves that issue. 
* Added/adjusted the following automated tests: ** AnalyzeDDLTest: CTAS into Kudu with varchar type ** AnalyzeKuduDDLTest: CREATE TABLE in Kudu with VARCHAR type ** kudu_create.test: Create table with VARCHAR column, key, hash partition, and range partition ** kudu_describe.test: Describe table with VARCHAR column and key ** kudu_insert.test: Insert with VARCHAR columns including null and non-null defaults ** kudu_update.test: Updates with VARCHAR column ** kudu_upsert.test: Upserts with VARCHAR column ** kudu_delete.test: Deletes with VARCHAR columns ** kudu-scan-node.test: Tests basic predicates with VARCHAR columns Follow on work: - IMPALA-9580: Add min-max runtime filter support/tests - IMPALA-9581: Pushdown string predicates - IMPALA-9583: Automated multibyte truncation tests Change-Id: I0d4959410fdd882bfa980cb55e8a7837c7823da8 Reviewed-on: http://gerrit.cloudera.org:8080/14197 Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com>
173 lines
4.9 KiB
Plaintext
173 lines
4.9 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Make sure LIMIT is enforced.
# The ORDER BY id means the single expected row does not depend on scan order.
|
|
select * from functional_kudu.dimtbl order by id limit 1;
|
|
---- RESULTS
|
|
1001,'Name1',94611
|
|
---- TYPES
|
|
BIGINT, STRING, INT
|
|
====
|
|
---- QUERY
|
|
# Make sure that we can list the columns to be scanned in any order, that predicates
|
|
# work and that we can have predicates on columns not referenced elsewhere.
# Note: several predicates are written with the literal on the left-hand side
# (1002 >= id, 94611 = zip, 'Name1' = name), and 'name' appears only in the WHERE clause.
|
|
select zip, id from functional_kudu.dimtbl where id >= 1000 and 1002 >= id and
|
|
94611 = zip and 'Name1' = name order by id;
|
|
---- RESULTS
|
|
94611,1001
|
|
---- TYPES
|
|
INT, BIGINT
|
|
====
|
|
---- QUERY
|
|
# Regression test for IMPALA-2740, a NULL value from a previously filtered row would
|
|
# carry over into the next unfiltered row (the result below would incorrectly be 2,NULL).
# Row (1, NULL) is removed by 'key != 1'; row 2 must come back with its own value, -2.
|
|
CREATE TABLE impala_2740 (key INT PRIMARY KEY, value INT)
|
|
PARTITION BY HASH (key) PARTITIONS 3 STORED AS KUDU;
|
|
INSERT INTO impala_2740 VALUES (1, NULL), (2, -2);
|
|
SELECT * FROM impala_2740 WHERE key != 1;
|
|
---- RESULTS
|
|
2,-2
|
|
---- TYPES
|
|
INT, INT
|
|
====
|
|
---- QUERY
|
|
# Regression test for IMPALA-2635, the Kudu scanner hangs waiting for data from scanner
|
|
# threads that are never started. The union and both scans land in the same fragment which
|
|
# is run on all impalads. However, for the t1 table there is only a single scan range,
|
|
# so two of the scan instances get empty scan ranges.
# Setup below: t1 is created with 3 hash partitions and t2 with 16; each table holds
# exactly one row, and the UNION ALL must return both rows.
|
|
CREATE TABLE impala_2635_t1 (id BIGINT PRIMARY KEY, name STRING)
|
|
PARTITION BY HASH (id) PARTITIONS 3 STORED AS KUDU;
|
|
CREATE TABLE impala_2635_t2 (id BIGINT PRIMARY KEY, name STRING)
|
|
PARTITION BY HASH(id) PARTITIONS 16 STORED AS KUDU;
|
|
INSERT INTO impala_2635_t1 VALUES (0, 'Foo');
|
|
INSERT INTO impala_2635_t2 VALUES (1, 'Blah');
|
|
SELECT * FROM impala_2635_t1 UNION ALL SELECT * FROM impala_2635_t2;
|
|
---- RESULTS
|
|
0,'Foo'
|
|
1,'Blah'
|
|
---- TYPES
|
|
BIGINT, STRING
|
|
====
|
|
---- QUERY
|
|
# IMPALA-4408: Test Kudu scans where all materialized slots are non-nullable.
# Only int_col is referenced by the query; the expected count is 10.
|
|
select count(int_col) from functional_kudu.tinyinttable
|
|
---- RESULTS
|
|
10
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
|
|
# IMPALA-4859: Test Kudu IS NULL/IS NOT NULL pushdown
# IS NULL case: combines a range predicate on id with IS NULL on the float_col column.
|
|
select count(*) from functional_kudu.alltypesagg where id < 10 and float_col is null;
|
|
---- RESULTS
|
|
2
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
|
|
# IS NOT NULL counterpart of the IS NULL pushdown check (presumably also IMPALA-4859):
# same id < 10 range, counting the rows whose float_col is not NULL.
select count(*) from functional_kudu.alltypesagg where id < 10 and float_col is not null;
|
|
---- RESULTS
|
|
9
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
|
|
# alltypes.id is primary key/not nullable, verify IS NOT NULL/IS NULL pushdown works
# IS NOT NULL case: no row can be excluded by the predicate; expected count is 7300.
|
|
select count(*) from functional_kudu.alltypes where id is not null;
|
|
---- RESULTS
|
|
7300
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
|
|
# IS NULL on alltypes.id (the column the preceding test describes as primary key/not
# nullable): no row can match, so the expected count is 0.
select count(*) from functional_kudu.alltypes where id is null;
|
|
---- RESULTS
|
|
0
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
|
|
# Push down TIMESTAMP binary predicates
# Both bounds are inclusive (<= and >=), so the boundary rows (ids 4 and 8) are returned.
|
|
select id, timestamp_col from functional_kudu.alltypes where
|
|
timestamp_col <= cast('2009-01-01 00:08:00.28' as timestamp) and
|
|
timestamp_col >= cast('2009-01-01 00:04:00.6' as timestamp)
|
|
order by id;
|
|
---- RESULTS
|
|
4,2009-01-01 00:04:00.600000000
|
|
5,2009-01-01 00:05:00.100000000
|
|
6,2009-01-01 00:06:00.150000000
|
|
7,2009-01-01 00:07:00.210000000
|
|
8,2009-01-01 00:08:00.280000000
|
|
---- TYPES
|
|
INT, TIMESTAMP
|
|
====
|
|
---- QUERY
|
|
# Out-of-range TIMESTAMP predicate (evaluates to NULL)
# A comparison against NULL matches no rows, hence the empty RESULTS section below.
|
|
select id, timestamp_col from functional_kudu.alltypes where
|
|
timestamp_col > cast('1000-01-01 00:00:00.00' as timestamp)
|
|
---- RESULTS
|
|
---- TYPES
|
|
INT, TIMESTAMP
|
|
====
|
|
---- QUERY
|
|
# Exclusive-bound (< and >) TIMESTAMP range using the same two boundary values as the
# inclusive test: the boundary rows (ids 4 and 8) must not be returned.
select id, timestamp_col from functional_kudu.alltypes where
|
|
timestamp_col < cast('2009-01-01 00:08:00.28' as timestamp) and
|
|
timestamp_col > cast('2009-01-01 00:04:00.6' as timestamp)
|
|
order by id;
|
|
---- RESULTS
|
|
5,2009-01-01 00:05:00.100000000
|
|
6,2009-01-01 00:06:00.150000000
|
|
7,2009-01-01 00:07:00.210000000
|
|
---- TYPES
|
|
INT, TIMESTAMP
|
|
====
|
|
---- QUERY
|
|
# TIMESTAMP equality predicate; expected to match exactly one row (id 8).
select id, timestamp_col from functional_kudu.alltypes where
|
|
timestamp_col = cast('2009-01-01 00:08:00.28' as timestamp);
|
|
---- RESULTS
|
|
8,2009-01-01 00:08:00.280000000
|
|
---- TYPES
|
|
INT, TIMESTAMP
|
|
====
|
|
---- QUERY
|
|
# Push down TIMESTAMP IN list predicates
# Each of the two IN-list values matches one row; note the expected output prints these
# whole-second timestamps without a fractional part.
|
|
select id, timestamp_col from functional_kudu.alltypes where
|
|
timestamp_col in (cast('2010-03-01 00:00:00' as timestamp),
|
|
cast('2010-03-01 00:01:00' as timestamp))
|
|
order by id;
|
|
---- RESULTS
|
|
4240,2010-03-01 00:00:00
|
|
4241,2010-03-01 00:01:00
|
|
---- TYPES
|
|
INT, TIMESTAMP
|
|
====
|
|
---- QUERY
|
|
# Push down VARCHAR predicates
# The inclusive range 'b'..'y' must exclude 'a' (key 1), 'z' (key 5) and the NULL row
# (key 6). The VARCHAR(10) column is listed as STRING in the TYPES section.
|
|
CREATE TABLE kudu_varchar_pred (key INT PRIMARY KEY, varchar_col VARCHAR(10))
|
|
PARTITION BY HASH (key) PARTITIONS 4 STORED AS KUDU;
|
|
INSERT INTO kudu_varchar_pred VALUES
|
|
(1, cast('a' as VARCHAR(10))),
|
|
(2, cast('b' as VARCHAR(10))),
|
|
(3, cast('m' as VARCHAR(10))),
|
|
(4, cast('y' as VARCHAR(10))),
|
|
(5, cast('z' as VARCHAR(10))),
|
|
(6, NULL);
|
|
select key, varchar_col from kudu_varchar_pred where
|
|
varchar_col >= cast('b' as VARCHAR(10)) and
|
|
varchar_col <= cast('y' as VARCHAR(10))
|
|
order by key;
|
|
---- RESULTS
|
|
2,'b'
|
|
3,'m'
|
|
4,'y'
|
|
---- TYPES
|
|
INT, STRING
|
|
====
|
|
---- QUERY
|
|
# Regression test for IMPALA-6187. Make sure count(*) queries with partition columns only
|
|
# won't miss conjuncts evaluation. 'id' is the partition column here.
# The expected count is 0; a non-zero count would mean the conjunct was not evaluated.
|
|
select count(*) from functional_kudu.alltypes where rand() + id < 0.0;
|
|
---- RESULTS
|
|
0
|
|
---- TYPES
|
|
BIGINT
|
|
==== |