mirror of
https://github.com/apache/impala.git
synced 2026-01-25 09:01:08 -05:00
Currently, the trim function (including BTRIM, LTRIM, RTRIM) cannot correctly handle strings containing multi-byte UTF-8 characters. Multi-byte UTF-8 characters are interpreted as multiple single-byte characters, leading to unexpected results. This patch provides UTF-8 support for the trim functions, enabling these functions to correctly handle multi-byte UTF-8 characters (when set utf8_mode=true). It also introduces a set of trim functions with the 'utf8_' prefix, offering the same capability even when utf8_mode is not enabled. Testing: - Added new BE test case in ExprTest#Utf8Test - Added new E2E test case in TestUtf8StringFunctions Change-Id: I5cfaffd71009f16eae75910af835bd2a34410856 Reviewed-on: http://gerrit.cloudera.org:8080/20926 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
398 lines
9.2 KiB
Plaintext
398 lines
9.2 KiB
Plaintext
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select length('你好'), length('你好hello'), length('你好 hello 你好')
|
||
---- RESULTS
|
||
2,7,11
|
||
---- TYPES
|
||
INT,INT,INT
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=false;
|
||
select length('你好'), length('你好hello'), length('你好 hello 你好')
|
||
---- RESULTS
|
||
6,11,19
|
||
---- TYPES
|
||
INT,INT,INT
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select substring('你好hello', 1, 3)
|
||
---- RESULTS: RAW_STRING
|
||
'你好h'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=false;
|
||
select substring('你好hello', 1, 3)
|
||
---- RESULTS: RAW_STRING
|
||
'你'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select reverse('你好hello你好');
|
||
---- RESULTS: RAW_STRING
|
||
'好你olleh好你'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=off;
|
||
select id, length(name), substring(name, 1, 3), length(substring(name, 1, 3)) from utf8_str_tiny
|
||
---- RESULTS: RAW_STRING
|
||
1,6,'张',3
|
||
2,6,'李',3
|
||
3,6,'王',3
|
||
4,9,'李',3
|
||
5,5,'Ali',3
|
||
6,6,'陈',3
|
||
7,7,'Бo',3
|
||
8,5,'Jö',3
|
||
9,9,'ひ',3
|
||
10,6,'서',3
|
||
---- TYPES
|
||
INT,INT,STRING,INT
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, length(name), substring(name, 1, 2), reverse(name) from utf8_str_tiny
|
||
---- RESULTS: RAW_STRING
|
||
1,2,'张三','三张'
|
||
2,2,'李四','四李'
|
||
3,2,'王五','五王'
|
||
4,3,'李小','龙小李'
|
||
5,5,'Al','ecilA'
|
||
6,4,'陈B','boB陈'
|
||
7,5,'Бo','cиpoБ'
|
||
8,4,'Jö','gröJ'
|
||
9,3,'ひな','たなひ'
|
||
10,2,'서연','연서'
|
||
---- TYPES
|
||
INT,INT,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
# Test utf8 functions in where clause.
|
||
set utf8_mode=true;
|
||
select id, name from functional.utf8_str_tiny
|
||
where length(name) = 2 and substring(name, 1, 1) = '李';
|
||
---- RESULTS: RAW_STRING
|
||
2,'李四'
|
||
---- TYPES
|
||
INT,STRING
|
||
====
|
||
---- QUERY
|
||
# Test utf8 functions in group by clause. group_concat() may produce undetermined results
|
||
# due to the order. Here we wrap it with length().
|
||
set utf8_mode=true;
|
||
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
|
||
group by substring(name, 1, 1);
|
||
---- RESULTS: RAW_STRING
|
||
'A',5
|
||
'ひ',3
|
||
'陈',4
|
||
'王',2
|
||
'张',2
|
||
'서',2
|
||
'J',4
|
||
'Б',5
|
||
'李',7
|
||
---- TYPES
|
||
STRING,INT
|
||
====
|
||
---- QUERY
|
||
# Test utf8 functions in group by and having clauses. group_concat() may produce
|
||
# undetermined results due to the order. Here we wrap it with length().
|
||
set utf8_mode=true;
|
||
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
|
||
group by substring(name, 1, 1)
|
||
having length(group_concat(name)) = 7;
|
||
---- RESULTS: RAW_STRING
|
||
'李',7
|
||
---- TYPES
|
||
STRING,INT
|
||
====
|
||
---- QUERY
|
||
# Each Chinese character is encoded into 3 bytes in UTF-8.
|
||
set utf8_mode=false;
|
||
select instr('最快的SQL引擎跑SQL', 'SQL'),
|
||
instr('最快的SQL引擎跑SQL', '引擎'),
|
||
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
|
||
instr('最快的SQL引擎跑SQL', '跑SQL'),
|
||
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
|
||
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
|
||
---- RESULTS
|
||
10,13,10,19,22,10
|
||
---- TYPES
|
||
INT,INT,INT,INT,INT,INT
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select instr('最快的SQL引擎跑SQL', 'SQL'),
|
||
instr('最快的SQL引擎跑SQL', '引擎'),
|
||
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
|
||
instr('最快的SQL引擎跑SQL', '跑SQL'),
|
||
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
|
||
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
|
||
---- RESULTS
|
||
4,7,4,9,10,4
|
||
---- TYPES
|
||
INT,INT,INT,INT,INT,INT
|
||
====
|
||
---- QUERY
|
||
# Each Chinese character is encoded into 3 bytes in UTF-8.
|
||
set utf8_mode=false;
|
||
select locate('SQL', '最快的SQL引擎跑SQL'),
|
||
locate('引擎', '最快的SQL引擎跑SQL'),
|
||
locate('SQL引擎', '最快的SQL引擎跑SQL'),
|
||
locate('跑SQL', '最快的SQL引擎跑SQL'),
|
||
locate('SQL', '最快的SQL引擎跑SQL', 4),
|
||
locate('SQL', '最快的SQL引擎跑SQL', 11);
|
||
---- RESULTS
|
||
10,13,10,19,10,22
|
||
---- TYPES
|
||
INT,INT,INT,INT,INT,INT
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select locate('SQL', '最快的SQL引擎跑SQL'),
|
||
locate('引擎', '最快的SQL引擎跑SQL'),
|
||
locate('SQL引擎', '最快的SQL引擎跑SQL'),
|
||
locate('跑SQL', '最快的SQL引擎跑SQL'),
|
||
locate('SQL', '最快的SQL引擎跑SQL', 4),
|
||
locate('SQL', '最快的SQL引擎跑SQL', 10);
|
||
---- RESULTS
|
||
4,7,4,9,4,10
|
||
---- TYPES
|
||
INT,INT,INT,INT,INT,INT
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('SQL引擎', 'x', 'x', 'x', 'x'),
|
||
mask_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
|
||
mask_show_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
|
||
mask_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
|
||
mask_show_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x');
|
||
---- RESULTS: RAW_STRING
|
||
'xxxxx','SQLxx','SQxxx','xxL引擎','xxx引擎'
|
||
---- TYPES
|
||
STRING,STRING,STRING,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('abcd áäèü ABCD ÁÄÈÜ');
|
||
---- RESULTS: RAW_STRING
|
||
'xxxx xxxx XXXX XXXX'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('Ich möchte ein Bier. Tschüss');
|
||
---- RESULTS: RAW_STRING
|
||
'Xxx xxxxxx xxx Xxxx. Xxxxxxx'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ');
|
||
---- RESULTS: RAW_STRING
|
||
'Xxxxxxxxx xxxxxxxxx XXXXXXXXX'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('German äöüß ÄÖÜẞ');
|
||
---- RESULTS: RAW_STRING
|
||
'Xxxxxx xxxx XXXX'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ');
|
||
---- RESULTS: RAW_STRING
|
||
'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234');
|
||
---- RESULTS: RAW_STRING
|
||
'Xxxxx xxxx xxxx XXXX XXXX nnnn'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask_first_n('áéíöóőüúű');
|
||
---- RESULTS: RAW_STRING
|
||
'xxxxóőüúű'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask_show_first_n('áéíöóőüúű');
|
||
---- RESULTS: RAW_STRING
|
||
'áéíöxxxxx'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask_last_n('áéíöóőüúű');
|
||
---- RESULTS: RAW_STRING
|
||
'áéíöóxxxx'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select mask_show_last_n('áéíöóőüúű')
|
||
---- RESULTS: RAW_STRING
|
||
'xxxxxőüúű'
|
||
---- TYPES
|
||
STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=false;
|
||
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
|
||
---- RESULTS: RAW_STRING
|
||
'ABCD áäèü','abcd ÁÄÈÜ','Abcd áäèü Abcd ÁÄÈÜ'
|
||
---- TYPES
|
||
STRING,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
|
||
---- RESULTS: RAW_STRING
|
||
'ABCD ÁÄÈÜ','abcd áäèü','Abcd Áäèü Abcd Áäèü'
|
||
---- TYPES
|
||
STRING,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=false;
|
||
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','张三','张三'
|
||
2,'李四','李四','李四'
|
||
3,'王五','王五','王五'
|
||
4,'李小龙','李小龙','李小龙'
|
||
5,'ALICE','alice','Alice'
|
||
6,'陈BOB','陈bob','陈bob'
|
||
7,'БOPиC','Бopиc','Бopиc'
|
||
8,'JöRG','jörg','Jörg'
|
||
9,'ひなた','ひなた','ひなた'
|
||
10,'서연','서연','서연'
|
||
---- TYPES
|
||
INT,STRING,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','张三','张三'
|
||
2,'李四','李四','李四'
|
||
3,'王五','王五','王五'
|
||
4,'李小龙','李小龙','李小龙'
|
||
5,'ALICE','alice','Alice'
|
||
6,'陈BOB','陈bob','陈bob'
|
||
7,'БOPИC','бopиc','Бopиc'
|
||
8,'JÖRG','jörg','Jörg'
|
||
9,'ひなた','ひなた','ひなた'
|
||
10,'서연','서연','서연'
|
||
---- TYPES
|
||
INT,STRING,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, name, ltrim(name, substr(name, 1, 1)) from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','三'
|
||
2,'李四','四'
|
||
3,'王五','五'
|
||
4,'李小龙','小龙'
|
||
5,'Alice','lice'
|
||
6,'陈Bob','Bob'
|
||
7,'Бopиc','opиc'
|
||
8,'Jörg','örg'
|
||
9,'ひなた','なた'
|
||
10,'서연','연'
|
||
---- TYPES
|
||
INT,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, name, rtrim(name, substr(name, 2)) from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','张'
|
||
2,'李四','李'
|
||
3,'王五','王'
|
||
4,'李小龙','李'
|
||
5,'Alice','A'
|
||
6,'陈Bob','陈'
|
||
7,'Бopиc','Б'
|
||
8,'Jörg','J'
|
||
9,'ひなた','ひ'
|
||
10,'서연','서'
|
||
---- TYPES
|
||
INT,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, name, ltrim(name, '张李王小A陈БJひ서') from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','三'
|
||
2,'李四','四'
|
||
3,'王五','五'
|
||
4,'李小龙','龙'
|
||
5,'Alice','lice'
|
||
6,'陈Bob','Bob'
|
||
7,'Бopиc','opиc'
|
||
8,'Jörg','örg'
|
||
9,'ひなた','なた'
|
||
10,'서연','연'
|
||
---- TYPES
|
||
INT,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, name, rtrim(name, '三四五小龙') from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','张'
|
||
2,'李四','李'
|
||
3,'王五','王'
|
||
4,'李小龙','李'
|
||
5,'Alice','Alice'
|
||
6,'陈Bob','陈Bob'
|
||
7,'Бopиc','Бopиc'
|
||
8,'Jörg','Jörg'
|
||
9,'ひなた','ひなた'
|
||
10,'서연','서연'
|
||
---- TYPES
|
||
INT,STRING,STRING
|
||
====
|
||
---- QUERY
|
||
set utf8_mode=true;
|
||
select id, name, btrim(name, '!?。,:;‘’“”≠≥≤∞ε∑∫√') from utf8_str_tiny;
|
||
---- RESULTS: RAW_STRING
|
||
1,'张三','张三'
|
||
2,'李四','李四'
|
||
3,'王五','王五'
|
||
4,'李小龙','李小龙'
|
||
5,'Alice','Alice'
|
||
6,'陈Bob','陈Bob'
|
||
7,'Бopиc','Бopиc'
|
||
8,'Jörg','Jörg'
|
||
9,'ひなた','ひなた'
|
||
10,'서연','서연'
|
||
---- TYPES
|
||
INT,STRING,STRING
|
||
====
|