Files
impala/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
Mihaly Szjatinya 81f2673883 IMPALA-889: Add trim() function matching ANSI SQL definition
As agreed in JIRA discussions, the current PR extends existing TRIM
functionality with the support of SQL-standardized TRIM-FROM syntax:
TRIM({[LEADING / TRAILING / BOTH] | [STRING characters]} FROM expr).
Implemented based on the existing LTRIM / RTRIM / BTRIM family of
functions prepared earlier in IMPALA-6059 and extended for UTF-8 in
IMPALA-12718. Besides, partly based on abandoned PR
https://gerrit.cloudera.org/#/c/4474 and similar EXTRACT-FROM
functionality from https://github.com/apache/impala/commit/543fa73f3a846
f0e4527514c993cb0985912b06c.

Supported syntaxes:
Syntax #1 TRIM(<where> FROM <string>);
Syntax #2 TRIM(<charset> FROM <string>);
Syntax #3 TRIM(<where> <charset> FROM <string>);

"where": Case-insensitive trim direction. Valid options are "leading",
  "trailing", and "both". "leading" means trimming characters from the
  start; "trailing" means trimming characters from the end; "both" means
  trimming characters from both sides. For Syntax #2, since no "where"
  is specified, the option "both" is implied by default.

"charset": Case-sensitive characters to be removed. This argument is
  regarded as a character set going to be removed. The occurrence order
  of each character doesn't matter and duplicated instances of the same
  character will be ignored. NULL argument implies " " (standard space)
  by default. Empty argument ("" or '') makes TRIM return the string
  untouched. For Syntax #1, since no "charset" is specified, it trims
  " " (standard space) by default.

"string": Case-sensitive target string to trim. This argument can be
  NULL.

The UTF8_MODE query option is honored by TRIM-FROM, similarly to
existing TRIM().

UTF8_TRIM-FROM can be used to force UTF8 mode regardless of the query
option.

Design Notes:
1. No-BE. Since the existing LTRIM / RTRIM / BTRIM functions fully cover
all needed use-cases, no backend logic is required. This differs from
similar EXTRACT-FROM.

2. Syntax wrapper. TrimFromExpr class was introduced as a syntax
wrapper around FunctionCallExpr, which instantiates one of the regular
LTRIM / RTRIM / BTRIM functions. TrimFromExpr's role is to maintain
the integrity of the "phantom" TRIM-FROM built-in function.

3. No TRIM keyword. Following EXTRACT-FROM, no "TRIM" keyword was
added to the language. Although generally a keyword would allow easier
and better parsing, on the negative side it restricts token's usage in
general context. However, leading/trailing/both, being previously
saved as reserved words, are now added as keywords to make possible
their usage with no escaping.

Change-Id: I3c4fa6d0d8d0684c4b6d8dac8fd531d205e4f7b4
Reviewed-on: http://gerrit.cloudera.org:8080/21825
Reviewed-by: Csaba Ringhofer <csringhofer@cloudera.com>
Tested-by: Csaba Ringhofer <csringhofer@cloudera.com>
2024-12-02 15:15:15 +00:00

580 lines
13 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
====
---- QUERY
set utf8_mode=true;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
2,7,11
---- TYPES
INT,INT,INT
====
---- QUERY
set utf8_mode=false;
select length('你好'), length('你好hello'), length('你好 hello 你好')
---- RESULTS
6,11,19
---- TYPES
INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你好h'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=false;
select substring('你好hello', 1, 3)
---- RESULTS: RAW_STRING
'你'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select reverse('你好hello你好');
---- RESULTS: RAW_STRING
'好你olleh好你'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=off;
select id, length(name), substring(name, 1, 3), length(substring(name, 1, 3)) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,6,'张',3
2,6,'李',3
3,6,'王',3
4,9,'李',3
5,5,'Ali',3
6,6,'陈',3
7,7,'Бo',3
8,5,'Jö',3
9,9,'ひ',3
10,6,'서',3
---- TYPES
INT,INT,STRING,INT
====
---- QUERY
set utf8_mode=true;
select id, length(name), substring(name, 1, 2), reverse(name) from utf8_str_tiny
---- RESULTS: RAW_STRING
1,2,'张三','三张'
2,2,'李四','四李'
3,2,'王五','五王'
4,3,'李小','龙小李'
5,5,'Al','ecilA'
6,4,'陈B','boB陈'
7,5,'Бo','cиpoБ'
8,4,'Jö','gröJ'
9,3,'ひな','たなひ'
10,2,'서연','연서'
---- TYPES
INT,INT,STRING,STRING
====
---- QUERY
# Test utf8 functions in where clause.
set utf8_mode=true;
select id, name from functional.utf8_str_tiny
where length(name) = 2 and substring(name, 1, 1) = '李';
---- RESULTS: RAW_STRING
2,'李四'
---- TYPES
INT,STRING
====
---- QUERY
# Test utf8 functions in group by clause. group_concat() may produce undetermined results
# due to the order. Here we wrap it with length().
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1);
---- RESULTS: RAW_STRING
'A',5
'ひ',3
'陈',4
'王',2
'张',2
'서',2
'J',4
'Б',5
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Test utf8 functions in group by and having clauses. group_concat() may produce
# undetermined results due to the order. Here we wrap it with length().
set utf8_mode=true;
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
group by substring(name, 1, 1)
having length(group_concat(name)) = 7;
---- RESULTS: RAW_STRING
'李',7
---- TYPES
STRING,INT
====
---- QUERY
# Each Chinese character is encoded into 3 bytes in UTF-8.
set utf8_mode=false;
select instr('最快的SQL引擎跑SQL', 'SQL'),
instr('最快的SQL引擎跑SQL', '引擎'),
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
instr('最快的SQL引擎跑SQL', '跑SQL'),
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
10,13,10,19,22,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select instr('最快的SQL引擎跑SQL', 'SQL'),
instr('最快的SQL引擎跑SQL', '引擎'),
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
instr('最快的SQL引擎跑SQL', '跑SQL'),
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
---- RESULTS
4,7,4,9,10,4
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
# Each Chinese character is encoded into 3 bytes in UTF-8.
set utf8_mode=false;
select locate('SQL', '最快的SQL引擎跑SQL'),
locate('引擎', '最快的SQL引擎跑SQL'),
locate('SQL引擎', '最快的SQL引擎跑SQL'),
locate('跑SQL', '最快的SQL引擎跑SQL'),
locate('SQL', '最快的SQL引擎跑SQL', 4),
locate('SQL', '最快的SQL引擎跑SQL', 11);
---- RESULTS
10,13,10,19,10,22
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select locate('SQL', '最快的SQL引擎跑SQL'),
locate('引擎', '最快的SQL引擎跑SQL'),
locate('SQL引擎', '最快的SQL引擎跑SQL'),
locate('跑SQL', '最快的SQL引擎跑SQL'),
locate('SQL', '最快的SQL引擎跑SQL', 4),
locate('SQL', '最快的SQL引擎跑SQL', 10);
---- RESULTS
4,7,4,9,4,10
---- TYPES
INT,INT,INT,INT,INT,INT
====
---- QUERY
set utf8_mode=true;
select mask('SQL引擎', 'x', 'x', 'x', 'x'),
mask_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_show_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
mask_show_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x');
---- RESULTS: RAW_STRING
'xxxxx','SQLxx','SQxxx','xxL引擎','xxx引擎'
---- TYPES
STRING,STRING,STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select mask('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'xxxx xxxx XXXX XXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('Ich möchte ein Bier. Tschüss');
---- RESULTS: RAW_STRING
'Xxx xxxxxx xxx Xxxx. Xxxxxxx'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ');
---- RESULTS: RAW_STRING
'Xxxxxxxxx xxxxxxxxx XXXXXXXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('German äöüß ÄÖÜẞ');
---- RESULTS: RAW_STRING
'Xxxxxx xxxx XXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ');
---- RESULTS: RAW_STRING
'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234');
---- RESULTS: RAW_STRING
'Xxxxx xxxx xxxx XXXX XXXX nnnn'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_first_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'xxxxóőüúű'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_show_first_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'áéíöxxxxx'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_last_n('áéíöóőüúű');
---- RESULTS: RAW_STRING
'áéíöóxxxx'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=true;
select mask_show_last_n('áéíöóőüúű')
---- RESULTS: RAW_STRING
'xxxxxőüúű'
---- TYPES
STRING
====
---- QUERY
set utf8_mode=false;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD áäèü','abcd ÁÄÈÜ','Abcd áäèü Abcd ÁÄÈÜ'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING
'ABCD ÁÄÈÜ','abcd áäèü','Abcd Áäèü Abcd Áäèü'
---- TYPES
STRING,STRING,STRING
====
---- QUERY
set utf8_mode=false;
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPиC','Бopиc','Бopиc'
8,'JöRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三','张三'
2,'李四','李四','李四'
3,'王五','王五','王五'
4,'李小龙','李小龙','李小龙'
5,'ALICE','alice','Alice'
6,'陈BOB','陈bob','陈bob'
7,'БOPИC','бopиc','Бopиc'
8,'JÖRG','jörg','Jörg'
9,'ひなた','ひなた','ひなた'
10,'서연','서연','서연'
---- TYPES
INT,STRING,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, ltrim(name, substr(name, 1, 1)) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','三'
2,'李四','四'
3,'王五','五'
4,'李小龙','小龙'
5,'Alice','lice'
6,'陈Bob','Bob'
7,'Бopиc','opиc'
8,'Jörg','örg'
9,'ひなた','なた'
10,'서연','연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim(leading substr(name, 1, 1) from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','三'
2,'李四','四'
3,'王五','五'
4,'李小龙','小龙'
5,'Alice','lice'
6,'陈Bob','Bob'
7,'Бopиc','opиc'
8,'Jörg','örg'
9,'ひなた','なた'
10,'서연','연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, rtrim(name, substr(name, 2)) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张'
2,'李四','李'
3,'王五','王'
4,'李小龙','李'
5,'Alice','A'
6,'陈Bob','陈'
7,'Бopиc','Б'
8,'Jörg','J'
9,'ひなた','ひ'
10,'서연','서'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim(trailing substr(name, 2) from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张'
2,'李四','李'
3,'王五','王'
4,'李小龙','李'
5,'Alice','A'
6,'陈Bob','陈'
7,'Бopиc','Б'
8,'Jörg','J'
9,'ひなた','ひ'
10,'서연','서'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, ltrim(name, '张李王小A陈БJひ서') from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','三'
2,'李四','四'
3,'王五','五'
4,'李小龙','龙'
5,'Alice','lice'
6,'陈Bob','Bob'
7,'Бopиc','opиc'
8,'Jörg','örg'
9,'ひなた','なた'
10,'서연','연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim(leading '张李王小A陈БJひ서' from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','三'
2,'李四','四'
3,'王五','五'
4,'李小龙','龙'
5,'Alice','lice'
6,'陈Bob','Bob'
7,'Бopиc','opиc'
8,'Jörg','örg'
9,'ひなた','なた'
10,'서연','연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, rtrim(name, '三四五小龙') from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张'
2,'李四','李'
3,'王五','王'
4,'李小龙','李'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim(trailing '三四五小龙' from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张'
2,'李四','李'
3,'王五','王'
4,'李小龙','李'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, btrim(name, '!?。,:;‘’“”≠≥≤∞ε∑∫√') from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三'
2,'李四','李四'
3,'王五','王五'
4,'李小龙','李小龙'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim('!?。,:;‘’“”≠≥≤∞ε∑∫√' from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三'
2,'李四','李四'
3,'王五','王五'
4,'李小龙','李小龙'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=false;
select id, name, utf8_trim('!?。,:;‘’“”≠≥≤∞ε∑∫√' from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三'
2,'李四','李四'
3,'王五','王五'
4,'李小龙','李小龙'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=false;
select id, name, utf8_trim('!?。,:;‘’“”≠≥≤∞ε∑∫√' from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三','张三'
2,'李四','李四'
3,'王五','王五'
4,'李小龙','李小龙'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, unnest(arr2) as item, trim(unnest(arr2) from "aaYYBBoo") from complextypes_arrays order by id, item;
---- RESULTS: RAW_STRING
1,'five','aaYYBBoo'
1,'four','aaYYBB'
1,'one','aaYYBB'
1,'three','aaYYBBoo'
1,'two','aaYYBB'
2,'five','aaYYBBoo'
2,'one','aaYYBB'
2,'three','aaYYBBoo'
2,'two','aaYYBB'
2,'NULL','aaYYBBoo'
3,'ten','aaYYBBoo'
4,'eight','aaYYBBoo'
4,'nine','aaYYBBoo'
4,'ten','aaYYBBoo'
5,'eleven','aaYYBBoo'
5,'ten','aaYYBBoo'
5,'thirteen','aaYYBBoo'
5,'twelve','aaYYBBoo'
6,'str1','aaYYBBoo'
6,'str2','aaYYBBoo'
9,'str1','aaYYBBoo'
9,'str2','aaYYBBoo'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim(first_value(name) over (order by id) from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三',''
2,'李四','李四'
3,'王五','王五'
4,'李小龙','李小龙'
5,'Alice','Alice'
6,'陈Bob','陈Bob'
7,'Бopиc','Бopиc'
8,'Jörg','Jörg'
9,'ひなた','ひなた'
10,'서연','서연'
---- TYPES
INT,STRING,STRING
====
---- QUERY
set utf8_mode=true;
select id, name, trim(name from name) from utf8_str_tiny;
---- RESULTS: RAW_STRING
1,'张三',''
2,'李四',''
3,'王五',''
4,'李小龙',''
5,'Alice',''
6,'陈Bob',''
7,'Бopиc',''
8,'Jörg',''
9,'ひなた',''
10,'서연',''
---- TYPES
INT,STRING,STRING
====