Files
impala/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
Joe McDonnell 11e66523d6 IMPALA-11526: Install en_US.UTF-8 locale into docker images
In IMPALA-11492, ExprTest.Utf8MaskTest was failing on some
configurations because the en_US.UTF-8 locale was missing. Since the
Docker images don't contain en_US.UTF-8, they are subject
to the same bug. This was confirmed by adding test cases
to the test_utf8_strings.py end-to-end test and running it
in the dockerized tests.

This adds the appropriate language pack to the list of packages
installed for the Docker build.

Testing:
 - This adds end-to-end tests to test_utf8_strings.py covering the
   same cases that were failing in ExprTest.Utf8MaskTest. They
   failed without the added language packs, and now succeed.

Change-Id: I353f257b3cb6d45f7d0a28f7d5319fdb457e6e3d
Reviewed-on: http://gerrit.cloudera.org:8080/19080
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Laszlo Gaal <laszlo.gaal@cloudera.com>
2022-10-11 20:30:50 +00:00

313 lines
7.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
====
---- QUERY
# length() on literals mixing Chinese and ASCII text with utf8_mode enabled.
# The expected values (2, 7, 11) count every character once, multi-byte or not.
set utf8_mode=true;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
2,7,11
---- TYPES
INT,INT,INT
====
---- QUERY
# Same length() calls with utf8_mode disabled.
# The expected values (6, 11, 19) add 3 per Chinese character and 1 per ASCII character.
set utf8_mode=false;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
6,11,19
---- TYPES
INT,INT,INT
====
---- QUERY
# substring(str, 1, 3) with utf8_mode enabled: the expected result is the first three
# characters of the literal (two Chinese characters followed by 'h').
set utf8_mode=true;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你好h'
---- TYPES
STRING
====
---- QUERY
# substring(str, 1, 3) with utf8_mode disabled: the expected result is only the first
# Chinese character of the literal.
set utf8_mode=false;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你'
---- TYPES
STRING
====
---- QUERY
# reverse() with utf8_mode enabled: the expected result reverses the order of the
# characters while each Chinese character itself stays intact.
set utf8_mode=true;
select reverse('你好hello你好');
---- RESULTS: RAW_STRING
'好你olleh好你'
---- TYPES
STRING
====
---- QUERY
# length() and substring() over the rows of utf8_str_tiny with utf8_mode turned off.
# Result columns: id, length(name), substring(name, 1, 3), length of that substring.
# Every expected substring has length 3, although it shows one, two or three
# characters depending on the row.
# NOTE(review): this case spells the option value 'off' while the other cases in this
# file use 'false' -- presumably an accepted equivalent spelling; confirm it is intended.
# NOTE(review): the table name is unqualified here, whereas some other cases use
# functional.utf8_str_tiny -- presumably resolved through the default database; confirm.
set utf8_mode=off;
select id, length(name), substring(name, 1, 3), length(substring(name, 1, 3)) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,6,'张',3
2,6,'李',3
3,6,'王',3
4,9,'李',3
5,5,'Ali',3
6,6,'陈',3
7,7,'Бo',3
8,5,'Jö',3
9,9,'ひ',3
10,6,'서',3
---- TYPES
INT,INT,STRING,INT
====
---- QUERY
# length(), substring() and reverse() over the rows of utf8_str_tiny with utf8_mode
# enabled. Result columns: id, length(name), substring(name, 1, 2), reverse(name).
# Expected lengths are per-character counts, the substring is the first two characters
# and reverse() keeps every multi-byte character intact.
set utf8_mode=true;
select id, length(name), substring(name, 1, 2), reverse(name) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,2,'张三','三张'
2,2,'李四','四李'
3,2,'王五','五王'
4,3,'李小','龙小李'
5,5,'Al','ecilA'
6,4,'陈B','boB陈'
7,5,'Бo','cиpoБ'
8,4,'Jö','gröJ'
9,3,'ひな','たなひ'
10,2,'서연','연서'
---- TYPES
INT,INT,STRING,STRING
====
---- QUERY
# Test utf8 functions in where clause.
# Only the row with id 2 is expected to satisfy both predicates: a name of length 2
# whose first character equals the compared Chinese character.
set utf8_mode=true;
select id, name from functional.utf8_str_tiny
where length(name) = 2 and substring(name, 1, 1) = '李';
---- RESULTS: RAW_STRING
2,'李四'
---- TYPES
INT,STRING
====
---- QUERY
# Test utf8 functions in group by clause. group_concat() may produce undetermined results
# due to the order. Here we wrap it with length().
# Rows are grouped by the first character of name; the expected output has one row per
# distinct first character together with the length of the concatenated names.
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1);
---- RESULTS: RAW_STRING
'A',5
'ひ',3
'陈',4
'王',2
'张',2
'서',2
'J',4
'Б',5
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Test utf8 functions in group by and having clauses. group_concat() may produce
# undetermined results due to the order. Here we wrap it with length().
# Only one group is expected to have a concatenated length of exactly 7 and thus
# pass the having filter.
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1)
having length(group_concat(name)) = 7;
---- RESULTS: RAW_STRING
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Each Chinese character is encoded into 3 bytes in UTF-8.
# With utf8_mode disabled the expected positions (10,13,10,19,22,10) are byte offsets:
# the first 'SQL' comes after three Chinese characters and is reported at 10.
# NOTE(review): the trailing "1, 2" and "-1, 2" arguments are presumably a start position
# and an occurrence number (negative start = search from the end) -- confirm in the docs.
set utf8_mode=false;
select instr('最快的SQL引擎跑SQL', 'SQL'),
instr('最快的SQL引擎跑SQL', '引擎'),
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
instr('最快的SQL引擎跑SQL', '跑SQL'),
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
10,13,10,19,22,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# instr() with utf8_mode enabled: the expected positions (4,7,4,9,10,4) count each
# Chinese character as one position, so the first 'SQL' is reported at 4.
# NOTE(review): the trailing "1, 2" and "-1, 2" arguments are presumably a start position
# and an occurrence number (negative start = search from the end) -- confirm in the docs.
set utf8_mode=true;
select instr('最快的SQL引擎跑SQL', 'SQL'),
instr('最快的SQL引擎跑SQL', '引擎'),
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
instr('最快的SQL引擎跑SQL', '跑SQL'),
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
4,7,4,9,10,4
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# Each Chinese character is encoded into 3 bytes in UTF-8.
# locate() takes the substring first and the searched string second. With utf8_mode
# disabled the expected positions (10,13,10,19,10,22) are byte offsets.
# NOTE(review): the optional third argument is presumably the start position: 4 still
# yields the first 'SQL' (10) while 11 yields the second one (22) -- confirm in the docs.
set utf8_mode=false;
select locate('SQL', '最快的SQL引擎跑SQL'),
locate('引擎', '最快的SQL引擎跑SQL'),
locate('SQL引擎', '最快的SQL引擎跑SQL'),
locate('跑SQL', '最快的SQL引擎跑SQL'),
locate('SQL', '最快的SQL引擎跑SQL', 4),
locate('SQL', '最快的SQL引擎跑SQL', 11);
---- RESULTS
10,13,10,19,10,22
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# locate() with utf8_mode enabled: the expected positions (4,7,4,9,4,10) count each
# Chinese character as one position.
# NOTE(review): the optional third argument is presumably the start position: 4 yields
# the first 'SQL' (4) while 10 yields the second one (10) -- confirm in the docs.
set utf8_mode=true;
select locate('SQL', '最快的SQL引擎跑SQL'),
locate('引擎', '最快的SQL引擎跑SQL'),
locate('SQL引擎', '最快的SQL引擎跑SQL'),
locate('跑SQL', '最快的SQL引擎跑SQL'),
locate('SQL', '最快的SQL引擎跑SQL', 4),
locate('SQL', '最快的SQL引擎跑SQL', 10);
---- RESULTS
4,7,4,9,4,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# The mask*() family on a literal of three ASCII letters followed by two Chinese
# characters, with all four replacement arguments set to 'x' and a count of 2.
# Expected: mask() replaces all five characters; mask_last_n() / mask_first_n() replace
# only the last / first two; mask_show_first_n() / mask_show_last_n() leave the first /
# last two untouched. In this expected output the Chinese characters are replaced only
# where they fall in the masked range (mask, mask_last_n, mask_show_first_n).
set utf8_mode=true;
select mask('SQL引擎', 'x', 'x', 'x', 'x'),
mask_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_show_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_show_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x');
---- RESULTS: RAW_STRING
'xxxxx','SQLxx','SQxxx','xxL引擎','xxx引擎'
---- TYPES
STRING,STRING,STRING,STRING,STRING
====
---- QUERY
# mask() with no explicit replacement characters on ASCII and accented Latin letters.
# Expected: every lowercase letter becomes 'x', every uppercase letter becomes 'X'
# (accented ones included) and the spaces are kept.
set utf8_mode=true;
select mask('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'xxxx xxxx XXXX XXXX'
---- TYPES
STRING
====
---- QUERY
# mask() with default replacements on a German sentence containing umlauts.
# Expected: one 'x'/'X' per letter (umlauts count as a single letter); the spaces and
# the '.' are left unchanged.
set utf8_mode=true;
select mask('Ich möchte ein Bier. Tschüss');
---- RESULTS: RAW_STRING
'Xxx xxxxxx xxx Xxxx. Xxxxxxx'
---- TYPES
STRING
====
---- QUERY
# mask() with default replacements on Hungarian accented letters.
# Expected: the nine lowercase accented letters become nine 'x' and the nine uppercase
# ones become nine 'X'.
set utf8_mode=true;
select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ');
---- RESULTS: RAW_STRING
'Xxxxxxxxx xxxxxxxxx XXXXXXXXX'
---- TYPES
STRING
====
---- QUERY
# mask() with default replacements on German umlauts and sharp s (both cases).
# Expected: the four lowercase letters become 'xxxx' and the four uppercase letters
# become 'XXXX'.
set utf8_mode=true;
select mask('German äöüß ÄÖÜẞ');
---- RESULTS: RAW_STRING
'Xxxxxx xxxx XXXX'
---- TYPES
STRING
====
---- QUERY
# mask() with default replacements on French accented letters and ligatures.
# Expected: each lowercase letter maps to one 'x' and each uppercase letter to one 'X'.
set utf8_mode=true;
select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ');
---- RESULTS: RAW_STRING
'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX'
---- TYPES
STRING
====
---- QUERY
# mask() with default replacements on Greek letters (plain and accented) and digits.
# Expected: lowercase letters become 'x', uppercase letters become 'X' and each digit
# becomes 'n'.
set utf8_mode=true;
select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234');
---- RESULTS: RAW_STRING
'Xxxxx xxxx xxxx XXXX XXXX nnnn'
---- TYPES
STRING
====
---- QUERY
# mask_first_n() called with only the string argument on nine accented letters.
# Expected: the first four characters are replaced by 'x', the remaining five are kept.
set utf8_mode=true;
select mask_first_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'xxxxóőüúű'
---- TYPES
STRING
====
---- QUERY
# mask_show_first_n() called with only the string argument on nine accented letters.
# Expected: the first four characters are kept, the remaining five are replaced by 'x'.
set utf8_mode=true;
select mask_show_first_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'áéíöxxxxx'
---- TYPES
STRING
====
---- QUERY
# mask_last_n() called with only the string argument on nine accented letters.
# Expected: the last four characters are replaced by 'x', the first five are kept.
set utf8_mode=true;
select mask_last_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'áéíöóxxxx'
---- TYPES
STRING
====
---- QUERY
# mask_show_last_n() called with only the string argument on nine accented letters.
# Expected: the last four characters are kept, the first five are replaced by 'x'.
set utf8_mode=true;
select mask_show_last_n('áéíöóőüúű')
---- RESULTS: RAW_STRING
'xxxxxőüúű'
---- TYPES
STRING
====
---- QUERY
# upper(), lower() and initcap() on literals with utf8_mode disabled.
# Expected: only the ASCII letters change case; the accented letters come back exactly
# as they were written in the input.
set utf8_mode=false;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD áäèü','abcd ÁÄÈÜ','Abcd áäèü Abcd ÁÄÈÜ'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
# upper(), lower() and initcap() on literals with utf8_mode enabled.
# Expected: the accented letters are case-converted as well; initcap() capitalizes the
# first letter of every space-separated word and lowercases the rest.
set utf8_mode=true;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD ÁÄÈÜ','abcd áäèü','Abcd Áäèü Abcd Áäèü'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
# upper(), lower() and initcap() over the rows of utf8_str_tiny with utf8_mode disabled.
# Expected: only ASCII letters change case. In rows 7 and 8 the non-ASCII letters keep
# their original case in every column, and the CJK/Kana/Hangul rows are unchanged.
# NOTE(review): row 7 mixes look-alike Latin and Cyrillic letters; keep the expected
# strings byte-for-byte when editing.
set utf8_mode=false;
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPиC','Бopиc','Бopиc'
8,'JöRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====
---- QUERY
# upper(), lower() and initcap() over the rows of utf8_str_tiny with utf8_mode enabled.
# Expected: non-ASCII letters are case-converted too (rows 7 and 8 are fully uppercased
# by upper() and fully lowercased by lower()); the CJK/Kana/Hangul rows are unchanged.
# NOTE(review): row 7 mixes look-alike Latin and Cyrillic letters; keep the expected
# strings byte-for-byte when editing.
set utf8_mode=true;
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPИC','бopиc','Бopиc'
8,'JÖRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====