Files
impala/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
stiga-huang e8720b40f1 IMPALA-2019(Part-1): Provide UTF-8 support in length, substring and reverse functions
A unicode character can be encoded into 1-4 bytes in UTF-8. String
functions will return undesired results when the input contains unicode
characters, because we deal with a string as a byte array. For instance,
length() returns the length in bytes, not in unicode characters.

UTF-8 is the dominant unicode encoding used in the Hadoop ecosystem.
This patch adds UTF-8 support in some string functions so they can have
UTF-8 aware behavior. For compatibility with the old versions, a new
query option, UTF8_MODE, is added for turning on/off the UTF-8 aware
behavior. Currently, only length(), substring() and reverse() support
it. Support for other functions will be added in later patches.

String functions will check the query option and switch to use the
desired implementation. It's similar to how we use the decimal_v2 query
option in builtin functions.

For easy testing, the UTF-8 aware versions of the string functions are
also exposed as builtin functions (named utf8_*, e.g. utf8_length).

Tests:
 - Add BE tests for utf8 functions.
 - Add e2e tests for the UTF8_MODE query option.

Change-Id: I0aaf3544e89f8a3d531ad6afe056b3658b525b7c
Reviewed-on: http://gerrit.cloudera.org:8080/16908
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2021-01-26 00:43:39 +00:00

117 lines
2.4 KiB
Plaintext

====
---- QUERY
# With UTF8_MODE on, length() counts characters instead of bytes, so each Chinese
# character below contributes 1 to the result.
set utf8_mode=true;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
2,7,11
---- TYPES
INT,INT,INT
====
---- QUERY
# With UTF8_MODE off, length() returns the length in bytes. Each Chinese character
# below takes 3 bytes in UTF-8, hence 6 for '你好'.
set utf8_mode=false;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
6,11,19
---- TYPES
INT,INT,INT
====
---- QUERY
# With UTF8_MODE on, substring() interprets position and length in characters, so the
# first 3 characters are returned.
set utf8_mode=true;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你好h'
---- TYPES
STRING
====
---- QUERY
# With UTF8_MODE off, substring() works on bytes: the first 3 bytes are exactly the
# first Chinese character.
set utf8_mode=false;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你'
---- TYPES
STRING
====
---- QUERY
# With UTF8_MODE on, reverse() reverses the order of characters while keeping each
# multi-byte character intact.
set utf8_mode=true;
select reverse('你好hello你好');
---- RESULTS: RAW_STRING
'好你olleh好你'
---- TYPES
STRING
====
---- QUERY
# Byte-based behavior on table data; UTF8_MODE is disabled with 'off' here rather than
# 'false'. substring(name, 1, 3) always yields 3 bytes, which is a single 3-byte
# character for some names and a 2-byte character plus a 1-byte one for others.
# NOTE(review): utf8_str_tiny is unqualified here while later test cases use
# functional.utf8_str_tiny - presumably resolved through the test's default database;
# confirm this is intentional.
set utf8_mode=off;
select id, length(name), substring(name, 1, 3), length(substring(name, 1, 3)) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,6,'张',3
2,6,'李',3
3,6,'王',3
4,9,'李',3
5,5,'Ali',3
6,6,'陈',3
7,7,'Бo',3
8,5,'Jö',3
9,9,'ひ',3
10,6,'서',3
---- TYPES
INT,INT,STRING,INT
====
---- QUERY
# Character-based behavior on table data: length() counts characters,
# substring(name, 1, 2) returns the first two characters and reverse() reverses by
# character, for names mixing multi-byte and single-byte characters.
# NOTE(review): utf8_str_tiny is unqualified here while later test cases use
# functional.utf8_str_tiny - presumably resolved through the test's default database;
# confirm this is intentional.
set utf8_mode=true;
select id, length(name), substring(name, 1, 2), reverse(name) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,2,'张三','三张'
2,2,'李四','四李'
3,2,'王五','五王'
4,3,'李小','龙小李'
5,5,'Al','ecilA'
6,4,'陈B','boB陈'
7,5,'Бo','cиpoБ'
8,4,'Jö','gröJ'
9,3,'ひな','たなひ'
10,2,'서연','연서'
---- TYPES
INT,INT,STRING,STRING
====
---- QUERY
# Test UTF-8 aware functions in the WHERE clause. Both predicates are evaluated in
# characters: only '李四' is 2 characters long and starts with '李'.
set utf8_mode=true;
select id, name from functional.utf8_str_tiny
where length(name) = 2 and substring(name, 1, 1) = '李';
---- RESULTS: RAW_STRING
2,'李四'
---- TYPES
INT,STRING
====
---- QUERY
# Test UTF-8 aware functions in the GROUP BY clause, grouping on the first character of
# each name. The order in which group_concat() concatenates values is nondeterministic,
# so only the length() of its result is checked.
# The '李' group holds two names (2 + 3 characters); the remaining 2 characters of its
# length 7 presumably come from group_concat()'s default ', ' separator.
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1);
---- RESULTS: RAW_STRING
'A',5
'ひ',3
'陈',4
'王',2
'张',2
'서',2
'J',4
'Б',5
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Test UTF-8 aware functions in the GROUP BY and HAVING clauses. The order in which
# group_concat() concatenates values is nondeterministic, so only the length() of its
# result is used. Only the '李' group reaches a concatenated length of 7 characters.
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1)
having length(group_concat(name)) = 7;
---- RESULTS: RAW_STRING
'李',7
---- TYPES
STRING,INT
====