mirror of
https://github.com/apache/impala.git
synced 2026-01-24 15:00:45 -05:00
Mask functions are used in Ranger column masking policies to mask
sensitive data. There are 5 mask functions: mask(), mask_first_n(),
mask_last_n(), mask_show_first_n(), mask_show_last_n(). Take mask() as
an example, by default, it will mask uppercase to 'X', lowercase to 'x',
digits to 'n' and leave other characters unmasked. For masking all
characters to '*', we can use
mask(my_col, '*', '*', '*', '*');
The current implementations mask strings byte-to-byte, which have
inconsistent results with Hive when the string contains unicode
characters:
mask('中国', '*', '*', '*', '*') => '******'
Each Chinese character is encoded into 3 bytes in UTF-8 so we get the
above result. The result in Hive is '**' since there are two Chinese
characters.
This patch provides consistent masking behavior with Hive for
strings under the UTF-8 mode, i.e., set UTF8_MODE=true. In UTF-8 mode,
the masked unit of a string is a unicode code point.
Implementation
- Extends the existing MaskTransform function to deal with unicode code
points(represented by uint32_t).
- Extends the existing GetFirstChar function to get the code point of
given masked charactors in UTF-8 mode.
- Implement a MaskSubStrUtf8 method as the core functionality.
- Swith to use MaskSubStrUtf8 instead of MaskSubStr in UTF-8 mode.
- For better testing, this patch also adds an overload for all mask
functions for only masking other chars but keeping the
upper/lower/digit chars unmasked. E.g. mask({col}, -1, -1, -1, 'X').
Tests
- Add BE tests in expr-test
- Add e2e tests in utf8-string-functions.test
Change-Id: I1276eccc94c9528507349b155a51e76f338367d5
Reviewed-on: http://gerrit.cloudera.org:8080/17780
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
183 lines
4.6 KiB
Plaintext
183 lines
4.6 KiB
Plaintext
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select length('你好'), length('你好hello'), length('你好 hello 你好')
|
|
---- RESULTS
|
|
2,7,11
|
|
---- TYPES
|
|
INT,INT,INT
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=false;
|
|
select length('你好'), length('你好hello'), length('你好 hello 你好')
|
|
---- RESULTS
|
|
6,11,19
|
|
---- TYPES
|
|
INT,INT,INT
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select substring('你好hello', 1, 3)
|
|
---- RESULTS: RAW_STRING
|
|
'你好h'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=false;
|
|
select substring('你好hello', 1, 3)
|
|
---- RESULTS: RAW_STRING
|
|
'你'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select reverse('你好hello你好');
|
|
---- RESULTS: RAW_STRING
|
|
'好你olleh好你'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=off;
|
|
select id, length(name), substring(name, 1, 3), length(substring(name, 1, 3)) from utf8_str_tiny
|
|
---- RESULTS: RAW_STRING
|
|
1,6,'张',3
|
|
2,6,'李',3
|
|
3,6,'王',3
|
|
4,9,'李',3
|
|
5,5,'Ali',3
|
|
6,6,'陈',3
|
|
7,7,'Бo',3
|
|
8,5,'Jö',3
|
|
9,9,'ひ',3
|
|
10,6,'서',3
|
|
---- TYPES
|
|
INT,INT,STRING,INT
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select id, length(name), substring(name, 1, 2), reverse(name) from utf8_str_tiny
|
|
---- RESULTS: RAW_STRING
|
|
1,2,'张三','三张'
|
|
2,2,'李四','四李'
|
|
3,2,'王五','五王'
|
|
4,3,'李小','龙小李'
|
|
5,5,'Al','ecilA'
|
|
6,4,'陈B','boB陈'
|
|
7,5,'Бo','cиpoБ'
|
|
8,4,'Jö','gröJ'
|
|
9,3,'ひな','たなひ'
|
|
10,2,'서연','연서'
|
|
---- TYPES
|
|
INT,INT,STRING,STRING
|
|
====
|
|
---- QUERY
|
|
# Test utf8 functions in where clause.
|
|
set utf8_mode=true;
|
|
select id, name from functional.utf8_str_tiny
|
|
where length(name) = 2 and substring(name, 1, 1) = '李';
|
|
---- RESULTS: RAW_STRING
|
|
2,'李四'
|
|
---- TYPES
|
|
INT,STRING
|
|
====
|
|
---- QUERY
|
|
# Test utf8 functions in group by clause. group_concat() may produce undetermined results
|
|
# due to the order. Here we wrap it with length().
|
|
set utf8_mode=true;
|
|
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
|
|
group by substring(name, 1, 1);
|
|
---- RESULTS: RAW_STRING
|
|
'A',5
|
|
'ひ',3
|
|
'陈',4
|
|
'王',2
|
|
'张',2
|
|
'서',2
|
|
'J',4
|
|
'Б',5
|
|
'李',7
|
|
---- TYPES
|
|
STRING,INT
|
|
====
|
|
---- QUERY
|
|
# Test utf8 functions in group by and having clauses. group_concat() may produce
|
|
# undetermined results due to the order. Here we wrap it with length().
|
|
set utf8_mode=true;
|
|
select substring(name, 1, 1), length(group_concat(name)) from functional.utf8_str_tiny
|
|
group by substring(name, 1, 1)
|
|
having length(group_concat(name)) = 7;
|
|
---- RESULTS: RAW_STRING
|
|
'李',7
|
|
---- TYPES
|
|
STRING,INT
|
|
====
|
|
---- QUERY
|
|
# Each Chinese character is encoded into 3 bytes in UTF-8.
|
|
set utf8_mode=false;
|
|
select instr('最快的SQL引擎跑SQL', 'SQL'),
|
|
instr('最快的SQL引擎跑SQL', '引擎'),
|
|
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
|
|
instr('最快的SQL引擎跑SQL', '跑SQL'),
|
|
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
|
|
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
|
|
---- RESULTS
|
|
10,13,10,19,22,10
|
|
---- TYPES
|
|
INT,INT,INT,INT,INT,INT
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select instr('最快的SQL引擎跑SQL', 'SQL'),
|
|
instr('最快的SQL引擎跑SQL', '引擎'),
|
|
instr('最快的SQL引擎跑SQL', 'SQL引擎'),
|
|
instr('最快的SQL引擎跑SQL', '跑SQL'),
|
|
instr('最快的SQL引擎跑SQL', 'SQL', 1, 2),
|
|
instr('最快的SQL引擎跑SQL', 'SQL', -1, 2);
|
|
---- RESULTS
|
|
4,7,4,9,10,4
|
|
---- TYPES
|
|
INT,INT,INT,INT,INT,INT
|
|
====
|
|
---- QUERY
|
|
# Each Chinese character is encoded into 3 bytes in UTF-8.
|
|
set utf8_mode=false;
|
|
select locate('SQL', '最快的SQL引擎跑SQL'),
|
|
locate('引擎', '最快的SQL引擎跑SQL'),
|
|
locate('SQL引擎', '最快的SQL引擎跑SQL'),
|
|
locate('跑SQL', '最快的SQL引擎跑SQL'),
|
|
locate('SQL', '最快的SQL引擎跑SQL', 4),
|
|
locate('SQL', '最快的SQL引擎跑SQL', 11);
|
|
---- RESULTS
|
|
10,13,10,19,10,22
|
|
---- TYPES
|
|
INT,INT,INT,INT,INT,INT
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select locate('SQL', '最快的SQL引擎跑SQL'),
|
|
locate('引擎', '最快的SQL引擎跑SQL'),
|
|
locate('SQL引擎', '最快的SQL引擎跑SQL'),
|
|
locate('跑SQL', '最快的SQL引擎跑SQL'),
|
|
locate('SQL', '最快的SQL引擎跑SQL', 4),
|
|
locate('SQL', '最快的SQL引擎跑SQL', 10);
|
|
---- RESULTS
|
|
4,7,4,9,4,10
|
|
---- TYPES
|
|
INT,INT,INT,INT,INT,INT
|
|
====
|
|
---- QUERY
|
|
set utf8_mode=true;
|
|
select mask('SQL引擎', 'x', 'x', 'x', 'x'),
|
|
mask_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
|
|
mask_show_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
|
|
mask_first_n('SQL引擎', 2, 'x', 'x', 'x', 'x'),
|
|
mask_show_last_n('SQL引擎', 2, 'x', 'x', 'x', 'x');
|
|
---- RESULTS: RAW_STRING
|
|
'xxxxx','SQLxx','SQxxx','xxL引擎','xxx引擎'
|
|
---- TYPES
|
|
STRING,STRING,STRING,STRING,STRING
|
|
====
|