Files
impala/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
Eyizoha e489ab35b1 IMPALA-12718: Provides UTF-8 support for the trim functions
Currently, the trim function (including BTRIM, LTRIM, RTRIM) cannot
correctly handle strings containing multi-byte UTF-8 characters.
Multi-byte UTF-8 characters are interpreted as multiple single-byte
characters, leading to unexpected results.

This patch provides UTF-8 support for the trim functions, enabling these
functions to correctly handle multi-byte UTF-8 characters (when set
utf8_mode=true). It also introduces a set of trim functions with the
'utf8_' prefix, offering the same capability even when utf8_mode is not
enabled.

Testing:
 - Added new BE test case in ExprTest#Utf8Test
 - Added new E2E test case in TestUtf8StringFunctions

Change-Id: I5cfaffd71009f16eae75910af835bd2a34410856
Reviewed-on: http://gerrit.cloudera.org:8080/20926
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2024-02-02 11:06:24 +00:00

398 lines
9.2 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
====
---- QUERY
set utf8_mode=true;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
2,7,11
---- TYPES
INT,INT,INT
====
---- QUERY
set utf8_mode=false;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
6,11,19
---- TYPES
INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你好h'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=false;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select reverse('你好hello你好');
---- RESULTS: RAW_STRING
'好你olleh好你'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=off;
select id, length(name), substring(name, 1, 3), length(substring(name, 1, 3)) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,6,'张',3
2,6,'李',3
3,6,'王',3
4,9,'李',3
5,5,'Ali',3
6,6,'陈',3
7,7,'Бo',3
8,5,'Jö',3
9,9,'ひ',3
10,6,'서',3
---- TYPES
INT,INT,STRING,INT
====
---- QUERY
set utf8_mode=true;
select id, length(name), substring(name, 1, 2), reverse(name) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,2,'张三','三张'
2,2,'李四','四李'
3,2,'王五','五王'
4,3,'李小','龙小李'
5,5,'Al','ecilA'
6,4,'陈B','boB陈'
7,5,'Бo','cиpoБ'
8,4,'Jö','gröJ'
9,3,'ひな','たなひ'
10,2,'서연','연서'
---- TYPES
INT,INT,STRING,STRING
====
---- QUERY
# Test utf8 functions in where clause.
set utf8_mode=true;
select id, name from functional.utf8_str_tiny
where length(name) = 2 and substring(name, 1, 1) = '李';
---- RESULTS: RAW_STRING
2,'李四'
---- TYPES
INT,STRING
====
---- QUERY
# Test utf8 functions in group by clause. group_concat() may produce undetermined results
# due to the order. Here we wrap it with length().
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1);
---- RESULTS: RAW_STRING
'A',5
'ひ',3
'陈',4
'王',2
'张',2
'서',2
'J',4
'Б',5
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Test utf8 functions in group by and having clauses. group_concat() may produce
# undetermined results due to the order. Here we wrap it with length().
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1)
having length(group_concat(name)) = 7;
---- RESULTS: RAW_STRING
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Each Chinese character is encoded into 3 bytes in UTF-8.
set utf8_mode=false;
select instr('最快的SQL引擎跑SQL', 'SQL'),
instr('最快的SQL引擎跑SQL', '引擎'),
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
instr('最快的SQL引擎跑SQL', '跑SQL'),
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
10,13,10,19,22,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select instr('最快的SQL引擎跑SQL', 'SQL'),
instr('最快的SQL引擎跑SQL', '引擎'),
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
instr('最快的SQL引擎跑SQL', '跑SQL'),
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
4,7,4,9,10,4
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# Each Chinese character is encoded into 3 bytes in UTF-8.
set utf8_mode=false;
select locate('SQL', '最快的SQL引擎跑SQL'),
locate('引擎', '最快的SQL引擎跑SQL'),
locate('SQL引擎', '最快的SQL引擎跑SQL'),
locate('跑SQL', '最快的SQL引擎跑SQL'),
locate('SQL', '最快的SQL引擎跑SQL', 4),
locate('SQL', '最快的SQL引擎跑SQL', 11);
---- RESULTS
10,13,10,19,10,22
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select locate('SQL', '最快的SQL引擎跑SQL'),
locate('引擎', '最快的SQL引擎跑SQL'),
locate('SQL引擎', '最快的SQL引擎跑SQL'),
locate('跑SQL', '最快的SQL引擎跑SQL'),
locate('SQL', '最快的SQL引擎跑SQL', 4),
locate('SQL', '最快的SQL引擎跑SQL', 10);
---- RESULTS
4,7,4,9,4,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select mask('SQL引擎', 'x', 'x', 'x', 'x'),
mask_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_show_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_show_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x');
---- RESULTS: RAW_STRING
'xxxxx','SQLxx','SQxxx','xxL引擎','xxx引擎'
---- TYPES
STRING,STRING,STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select mask('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'xxxx xxxx XXXX XXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('Ich möchte ein Bier. Tschüss');
---- RESULTS: RAW_STRING
'Xxx xxxxxx xxx Xxxx. Xxxxxxx'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ');
---- RESULTS: RAW_STRING
'Xxxxxxxxx xxxxxxxxx XXXXXXXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('German äöüß ÄÖÜẞ');
---- RESULTS: RAW_STRING
'Xxxxxx xxxx XXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ');
---- RESULTS: RAW_STRING
'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234');
---- RESULTS: RAW_STRING
'Xxxxx xxxx xxxx XXXX XXXX nnnn'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_first_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'xxxxóőüúű'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_show_first_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'áéíöxxxxx'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_last_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'áéíöóxxxx'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_show_last_n('áéíöóőüúű')
---- RESULTS: RAW_STRING
'xxxxxőüúű'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=false;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD áäèü','abcd ÁÄÈÜ','Abcd áäèü Abcd ÁÄÈÜ'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD ÁÄÈÜ','abcd áäèü','Abcd Áäèü Abcd Áäèü'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
set utf8_mode=false;
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPиC','Бopиc','Бopиc'
8,'JöRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPИC','бopиc','Бopиc'
8,'JÖRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, ltrim(name, substr(name, 1, 1)) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','三'
2,'李四','四'
3,'王五','五'
4,'李小龙','小龙'
5,'Alice','lice'
6,'陈Bob','Bob'
7,'Бopиc','opиc'
8,'Jörg','örg'
9,'ひなた','なた'
10,'서연','연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, rtrim(name, substr(name, 2)) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张'
2,'李四','李'
3,'王五','王'
4,'李小龙','李'
5,'Alice','A'
6,'陈Bob','陈'
7,'Бopиc','Б'
8,'Jörg','J'
9,'ひなた','ひ'
10,'서연','서'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, ltrim(name, '张李王小A陈БJひ서') from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','三'
2,'李四','四'
3,'王五','五'
4,'李小龙','龙'
5,'Alice','lice'
6,'陈Bob','Bob'
7,'Бopиc','opиc'
8,'Jörg','örg'
9,'ひなた','なた'
10,'서연','연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, rtrim(name, '三四五小龙') from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张'
2,'李四','李'
3,'王五','王'
4,'李小龙','李'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, btrim(name, '!?。,:;‘’“”≠≥≤∞ε∑∫√') from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三'
2,'李四','李四'
3,'王五','王五'
4,'李小龙','李小龙'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====