Files
impala/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
stiga-huang 35375b3287 IMPALA-2019(part-4): Add UTF-8 support for case conversion functions
There are 3 builtin case conversion string functions: upper(), lower(),
and initcap(). Previously they only convert English alphabetic
characters. This patch adds support to deal with Unicode characters.

There are many corner cases in case conversion depending on the locale
and context. E.g.
1) Case conversion is locale-sensitive.
Turkish has 4 letter "I"s. English has only two, a lowercase dotted i
and an uppercase dotless I. Turkish has lowercase and uppercase forms of
both dotted and dotless I. So simply converting "i" to "I" for upper
case is wrong in Turkish:
    +-------+--------+---------+
    |       | Dotted | Dotless |
    +-------+--------+---------+
    | Upper | İ      | I       |
    +-------+--------+---------+
    | Lower | i      | ı       |
    +-------+--------+---------+

2) Case conversion may change a string's length.
The German word "grüßen" should be converted to "GRÜSSEN" in upper case:
the letter "ß" should be converted to "SS".

3) Case conversion is context-sensitive.
The Greek word "ὈΔΥΣΣΕΎΣ" should be converted to "ὀδυσσεύς", where the
Greek letter "Σ" is converted to "σ" or to "ς", depending on its
position in the word.

The above cases will be the focus of follow-up JIRAs. This patch adds the
initial implementation of UTF-8 aware case conversion functions.

--------
Implementation:
In UTF-8 mode (turned on by set UTF8_MODE=true), these functions convert
the bytes in strings to wide characters using std::mbrtowc(). Each wide
character (wchar_t) is then converted using std::towupper or
std::towlower correspondingly. The results are then converted back to
multibyte sequences using std::wcrtomb().

Note that these builtins are locale aware. If impalad is launched
without a UTF-8 aware locale, e.g. LC_ALL="C", these builtins can't
recognize non-ASCII characters and will return unexpected results.
Thus we modify our docker images to set LC_ALL="C.UTF-8" instead of "C".
This patch also logs the current locale when launching impala daemons
for better debugging. We will support customized locale in IMPALA-11080.

Test:
 - Add BE unit tests and e2e tests.

Change-Id: I443e89d46f4638ce85664b021666bc4f03ee8abd
Reviewed-on: http://gerrit.cloudera.org:8080/17785
Reviewed-by: Csaba Ringhofer <csringhofer@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2022-02-15 18:40:59 +00:00

233 lines
6.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
====
---- QUERY
# With UTF-8 mode on, length() counts characters rather than bytes.
set utf8_mode=true;
SELECT
  length('你好'),
  length('你好hello'),
  length('你好 hello 你好')
---- RESULTS
2,7,11
---- TYPES
INT,INT,INT
====
---- QUERY
# With UTF-8 mode off, length() counts bytes: each Chinese character here
# contributes 3 to the result.
set utf8_mode=false;
SELECT
  length('你好'),
  length('你好hello'),
  length('你好 hello 你好')
---- RESULTS
6,11,19
---- TYPES
INT,INT,INT
====
---- QUERY
# With UTF-8 mode on, substring() positions and lengths are in characters.
set utf8_mode=true;
SELECT
  substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你好h'
---- TYPES
STRING
====
---- QUERY
# With UTF-8 mode off, substring() works on bytes: the first 3 bytes are
# exactly the first Chinese character.
set utf8_mode=false;
SELECT
  substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你'
---- TYPES
STRING
====
---- QUERY
# With UTF-8 mode on, reverse() reverses whole characters, so multi-byte
# characters stay intact.
set utf8_mode=true;
SELECT
  reverse('你好hello你好');
---- RESULTS: RAW_STRING
'好你olleh好你'
---- TYPES
STRING
====
---- QUERY
# Byte-oriented behaviour on table data: with UTF-8 mode off, length() reports
# bytes and substring(name, 1, 3) returns the first 3 bytes of each name.
# The option is disabled with the explicit boolean literal 'false' (not 'off')
# so the test does not depend on how unrecognised boolean spellings are parsed.
set utf8_mode=false;
SELECT
  id,
  length(name),
  substring(name, 1, 3),
  length(substring(name, 1, 3))
FROM utf8_str_tiny
---- RESULTS: RAW_STRING
1,6,'张',3
2,6,'李',3
3,6,'王',3
4,9,'李',3
5,5,'Ali',3
6,6,'陈',3
7,7,'Бo',3
8,5,'Jö',3
9,9,'ひ',3
10,6,'서',3
---- TYPES
INT,INT,STRING,INT
====
---- QUERY
# Character-oriented behaviour on table data: with UTF-8 mode on, length(),
# substring() and reverse() all operate on whole characters.
set utf8_mode=true;
SELECT
  id,
  length(name),
  substring(name, 1, 2),
  reverse(name)
FROM utf8_str_tiny
---- RESULTS: RAW_STRING
1,2,'张三','三张'
2,2,'李四','四李'
3,2,'王五','五王'
4,3,'李小','龙小李'
5,5,'Al','ecilA'
6,4,'陈B','boB陈'
7,5,'Бo','cиpoБ'
8,4,'Jö','gröJ'
9,3,'ひな','たなひ'
10,2,'서연','연서'
---- TYPES
INT,INT,STRING,STRING
====
---- QUERY
# UTF-8 aware functions used as predicates in the WHERE clause: keep only
# two-character names whose first character is '李'.
set utf8_mode=true;
SELECT
  id,
  name
FROM functional.utf8_str_tiny
WHERE length(name) = 2
  AND substring(name, 1, 1) = '李';
---- RESULTS: RAW_STRING
2,'李四'
---- TYPES
INT,STRING
====
---- QUERY
# UTF-8 aware functions used as the GROUP BY key. The order in which
# group_concat() joins names is not deterministic, so only the length of its
# output is checked.
set utf8_mode=true;
SELECT
  substring(name, 1, 1),
  length(group_concat(name))
FROM functional.utf8_str_tiny
GROUP BY substring(name, 1, 1);
---- RESULTS: RAW_STRING
'A',5
'ひ',3
'陈',4
'王',2
'张',2
'서',2
'J',4
'Б',5
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# UTF-8 aware functions used in both GROUP BY and HAVING. The order in which
# group_concat() joins names is not deterministic, so only the length of its
# output is compared.
set utf8_mode=true;
SELECT
  substring(name, 1, 1),
  length(group_concat(name))
FROM functional.utf8_str_tiny
GROUP BY substring(name, 1, 1)
HAVING length(group_concat(name)) = 7;
---- RESULTS: RAW_STRING
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# instr() with UTF-8 mode off returns byte positions. Every Chinese character
# occupies 3 bytes in UTF-8, so the first 'SQL' starts at byte 10.
set utf8_mode=false;
SELECT
  instr('最快的SQL引擎跑SQL', 'SQL'),
  instr('最快的SQL引擎跑SQL', '引擎'),
  instr('最快的SQL引擎跑SQL', 'SQL引擎'),
  instr('最快的SQL引擎跑SQL', '跑SQL'),
  instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
  instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
10,13,10,19,22,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# instr() with UTF-8 mode on returns character positions, so the first 'SQL'
# starts at position 4, right after the three leading Chinese characters.
set utf8_mode=true;
SELECT
  instr('最快的SQL引擎跑SQL', 'SQL'),
  instr('最快的SQL引擎跑SQL', '引擎'),
  instr('最快的SQL引擎跑SQL', 'SQL引擎'),
  instr('最快的SQL引擎跑SQL', '跑SQL'),
  instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
  instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
4,7,4,9,10,4
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# locate() with UTF-8 mode off: both the returned position and the optional
# start position are byte offsets. Every Chinese character occupies 3 bytes
# in UTF-8.
set utf8_mode=false;
SELECT
  locate('SQL', '最快的SQL引擎跑SQL'),
  locate('引擎', '最快的SQL引擎跑SQL'),
  locate('SQL引擎', '最快的SQL引擎跑SQL'),
  locate('跑SQL', '最快的SQL引擎跑SQL'),
  locate('SQL', '最快的SQL引擎跑SQL', 4),
  locate('SQL', '最快的SQL引擎跑SQL', 11);
---- RESULTS
10,13,10,19,10,22
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# locate() with UTF-8 mode on: both the returned position and the optional
# start position are character offsets.
set utf8_mode=true;
SELECT
  locate('SQL', '最快的SQL引擎跑SQL'),
  locate('引擎', '最快的SQL引擎跑SQL'),
  locate('SQL引擎', '最快的SQL引擎跑SQL'),
  locate('跑SQL', '最快的SQL引擎跑SQL'),
  locate('SQL', '最快的SQL引擎跑SQL', 4),
  locate('SQL', '最快的SQL引擎跑SQL', 10);
---- RESULTS
4,7,4,9,4,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# The mask*() family with UTF-8 mode on. Every replacement argument is 'x',
# so each masked character, multi-byte ones included, becomes a single 'x',
# and the counts (2) are applied per character.
set utf8_mode=true;
SELECT
  mask('SQL引擎', 'x', 'x', 'x', 'x'),
  mask_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
  mask_show_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
  mask_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
  mask_show_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x');
---- RESULTS: RAW_STRING
'xxxxx','SQLxx','SQxxx','xxL引擎','xxx引擎'
---- TYPES
STRING,STRING,STRING,STRING,STRING
====
---- QUERY
# Case conversion with UTF-8 mode off: only the ASCII letters change case,
# the accented characters are returned untouched.
set utf8_mode=false;
SELECT
  upper('abcd áäèü'),
  lower('ABCD ÁÄÈÜ'),
  initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD áäèü','abcd ÁÄÈÜ','Abcd áäèü Abcd ÁÄÈÜ'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
# Case conversion with UTF-8 mode on: accented characters are converted
# together with the ASCII letters.
set utf8_mode=true;
SELECT
  upper('abcd áäèü'),
  lower('ABCD ÁÄÈÜ'),
  initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD ÁÄÈÜ','abcd áäèü','Abcd Áäèü Abcd Áäèü'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
# Case conversion on table data with UTF-8 mode off: non-ASCII characters
# (e.g. 'и' in row 7, 'ö' in row 8) keep their original case.
set utf8_mode=false;
SELECT
  id,
  upper(name),
  lower(name),
  initcap(name)
FROM utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPиC','Бopиc','Бopиc'
8,'JöRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====
---- QUERY
# Case conversion on table data with UTF-8 mode on: non-ASCII letters
# (e.g. 'и' in row 7, 'ö' in row 8) are converted as well.
set utf8_mode=true;
SELECT
  id,
  upper(name),
  lower(name),
  initcap(name)
FROM utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPИC','бopиc','Бopиc'
8,'JÖRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====