Files
impala/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
pranavyl 85cd07a11e IMPALA-11499: Refactor UrlEncode function to handle special characters
The bug was in URL encoding: certain Unicode characters were encoded
incorrectly because individual bytes of their UTF-8 representation
matched bytes in the set of characters to escape.
For example, the string '运', which consists of the three bytes
0xe8 0xbf 0x90, was wrongly encoded as '\E8%FFFFFFBF\90',
because the middle byte matched one of the two bytes that
represented the "\u00FF" literal. Inclusion of "\u00FF" was likely
a mistake from the beginning and it should have been '\x7F'.

The patch makes three key changes:
1. Before the change, the set of characters that need to be escaped
was stored as a string. The current patch uses an unordered_set
instead.

2. '\xFF', which is an invalid UTF-8 byte and whose inclusion was
erroneous from the beginning, is replaced with '\x7F', which is a
control character for DELETE, ensuring consistency and correctness in
URL encoding.

3. The list of characters to be escaped is extended to match the
current list in Hive.

Testing: Tests on both traditional Hive tables and Iceberg tables
are included in unicode-column-name.test, insert.test,
coding-util-test.cc and test_insert.py.

Change-Id: I88c4aba5d811dfcec809583d0c16fcbc0ca730fb
Reviewed-on: http://gerrit.cloudera.org:8080/21131
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2024-05-09 15:09:21 +00:00

329 lines
7.3 KiB
Plaintext

====
---- QUERY
# TEXTFILE table whose first column has a non-ASCII (Hangul) name.
CREATE TABLE testtbl1 (
  `세율대분류구분코드` INT,
  s STRING COMMENT 'String col'
)
STORED AS TEXTFILE;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Issuing the same CREATE again with IF NOT EXISTS must report that the table
# already exists instead of failing.
CREATE TABLE IF NOT EXISTS testtbl1 (
  `세율대분류구분코드` INT,
  s STRING
)
STORED AS TEXTFILE;
---- RESULTS
'Table already exists.'
====
---- QUERY
SHOW TABLES;
---- RESULTS
'testtbl1'
---- TYPES
STRING
====
---- QUERY
# The Unicode column name is expected back exactly as it was declared.
DESCRIBE testtbl1;
---- RESULTS: RAW_STRING
'세율대분류구분코드','int',''
's','string','String col'
---- TYPES
STRING, STRING, STRING
====
---- QUERY
INSERT INTO TABLE testtbl1 VALUES (1, 'Alice');
====
---- QUERY
# Read the inserted row back through the Unicode-named column.
SELECT * FROM testtbl1;
---- RESULTS
1,'Alice'
---- TYPES
INT, STRING
====
---- QUERY
# Drop the table so that the next scenario's SHOW TABLES lists only its own table.
DROP TABLE testtbl1;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Kudu table whose primary key column has a non-ASCII (Hangul) name.
CREATE TABLE testtbl_kudu (
  `시스템처리최종사용자번호a` INT,
  s STRING COMMENT 'String col',
  PRIMARY KEY (`시스템처리최종사용자번호a`)
)
STORED AS KUDU;
---- RESULTS
'Table has been created.'
====
---- QUERY
SHOW TABLES;
---- RESULTS
'testtbl_kudu'
---- TYPES
STRING
====
---- QUERY
# The Unicode column name is expected back exactly as it was declared, followed by
# the Kudu-specific DESCRIBE columns.
DESCRIBE testtbl_kudu;
---- RESULTS: RAW_STRING
'시스템처리최종사용자번호a','int','','true','true','false','','AUTO_ENCODING','DEFAULT_COMPRESSION','0'
's','string','String col','false','','true','','AUTO_ENCODING','DEFAULT_COMPRESSION','0'
---- TYPES
STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING
====
---- QUERY
INSERT INTO TABLE testtbl_kudu VALUES (1, 'Alice');
====
---- QUERY
# Read the inserted row back through the Unicode-named key column.
SELECT * FROM testtbl_kudu;
---- RESULTS
1,'Alice'
---- TYPES
INT, STRING
====
---- QUERY
# Drop the table so that the next scenario's SHOW TABLES lists only its own table.
DROP TABLE testtbl_kudu;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Iceberg table whose first column has a non-ASCII (CJK) name.
CREATE TABLE testtbl_iceberg (
  `我` INT,
  s STRING COMMENT 'String col'
)
STORED AS ICEBERG;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Issuing the same CREATE again with IF NOT EXISTS must report that the table
# already exists instead of failing.
CREATE TABLE IF NOT EXISTS testtbl_iceberg (
  `我` INT,
  s STRING
)
STORED AS ICEBERG;
---- RESULTS
'Table already exists.'
====
---- QUERY
SHOW TABLES;
---- RESULTS
'testtbl_iceberg'
---- TYPES
STRING
====
---- QUERY
# The Unicode column name is expected back exactly as it was declared.
DESCRIBE testtbl_iceberg;
---- RESULTS: RAW_STRING
'我','int','','true'
's','string','String col','true'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
INSERT INTO TABLE testtbl_iceberg VALUES (1, 'Alice');
====
---- QUERY
# Read the inserted row back through the Unicode-named column.
SELECT * FROM testtbl_iceberg;
---- RESULTS
1,'Alice'
---- TYPES
INT, STRING
====
---- QUERY
# Drop the table so that the next scenario's SHOW TABLES lists only its own table.
DROP TABLE testtbl_iceberg;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# ORC table whose first column has a non-ASCII (CJK) name. The table is created
# from Impala, while the insert and read-back further down are issued as Hive queries.
CREATE TABLE $DATABASE.testtbl_orc (
  `我` INT,
  s STRING COMMENT 'String col'
)
STORED AS ORC;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Issuing the same CREATE again with IF NOT EXISTS must report that the table
# already exists instead of failing.
CREATE TABLE IF NOT EXISTS $DATABASE.testtbl_orc (
  `我` INT,
  s STRING
)
STORED AS ORC;
---- RESULTS
'Table already exists.'
====
---- QUERY
SHOW TABLES;
---- RESULTS
'testtbl_orc'
---- TYPES
STRING
====
---- QUERY
# The Unicode column name is expected back exactly as it was declared.
DESCRIBE $DATABASE.testtbl_orc;
---- RESULTS: RAW_STRING
'我','int',''
's','string','String col'
---- TYPES
STRING, STRING, STRING
====
---- IS_HDFS_ONLY
---- HIVE_QUERY
INSERT INTO TABLE $DATABASE.testtbl_orc VALUES (1, 'Alice');
====
---- IS_HDFS_ONLY
---- HIVE_QUERY
SELECT * FROM $DATABASE.testtbl_orc;
---- RESULTS
1,'Alice'
---- TYPES
INT, STRING
====
---- QUERY
# Drop the table so that the next scenario's SHOW TABLES lists only its own table.
DROP TABLE $DATABASE.testtbl_orc;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Avro table whose first column has a non-ASCII (CJK) name. The table is created
# from Impala, while the insert and read-back further down are issued as Hive queries.
CREATE TABLE $DATABASE.testtbl_avro (
  `我` INT,
  s STRING COMMENT 'String col'
)
STORED AS AVRO;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Issuing the same CREATE again with IF NOT EXISTS must report that the table
# already exists instead of failing.
CREATE TABLE IF NOT EXISTS $DATABASE.testtbl_avro (
  `我` INT,
  s STRING
)
STORED AS AVRO;
---- RESULTS
'Table already exists.'
====
---- QUERY
SHOW TABLES;
---- RESULTS
'testtbl_avro'
---- TYPES
STRING
====
---- QUERY
# The Unicode column name is expected back exactly as it was declared.
DESCRIBE $DATABASE.testtbl_avro;
---- RESULTS: RAW_STRING
'我','int','from deserializer'
's','string','String col'
---- TYPES
STRING, STRING, STRING
====
---- IS_HDFS_ONLY
---- HIVE_QUERY
INSERT INTO TABLE $DATABASE.testtbl_avro VALUES (1, 'Alice');
====
---- IS_HDFS_ONLY
---- HIVE_QUERY
SELECT * FROM $DATABASE.testtbl_avro;
---- RESULTS
1,'Alice'
---- TYPES
INT, STRING
====
---- QUERY
# Drop the table so that later scenarios start from an empty table list.
DROP TABLE $DATABASE.testtbl_avro;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Partitioned TEXTFILE table where both a regular column and the partition column
# have non-ASCII names.
CREATE TABLE testtbl_part (
  `我` INT,
  s STRING
)
PARTITIONED BY (`고객` INT COMMENT 'C')
STORED AS TEXTFILE;
---- RESULTS
'Table has been created.'
====
---- QUERY
# DESCRIBE is expected to list the partition column after the regular columns.
DESCRIBE testtbl_part;
---- RESULTS: RAW_STRING
'我','int',''
's','string',''
'고객','int','C'
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# Static-partition inserts that address the partition column by its Unicode name.
INSERT INTO TABLE testtbl_part PARTITION (`고객`=24) VALUES (1, 'Alice');
====
---- QUERY
INSERT INTO TABLE testtbl_part PARTITION (`고객`=20) VALUES (2, 'Alison');
====
---- QUERY
INSERT INTO TABLE testtbl_part PARTITION (`고객`=23) VALUES (3, 'Zack');
====
---- QUERY
# All three partitions are read back, with the partition value as the last column.
SELECT * FROM testtbl_part;
---- RESULTS
1,'Alice',24
2,'Alison',20
3,'Zack',23
---- TYPES
INT, STRING, INT
====
---- QUERY
DROP TABLE testtbl_part;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Parquet table whose first column has a non-ASCII (CJK) name.
CREATE TABLE testtbl_parquet (
  `我` INT,
  s STRING COMMENT 'String col'
)
STORED AS PARQUET;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Issuing the same CREATE again with IF NOT EXISTS must report that the table
# already exists instead of failing.
CREATE TABLE IF NOT EXISTS testtbl_parquet (
  `我` INT,
  s STRING
)
STORED AS PARQUET;
---- RESULTS
'Table already exists.'
====
---- QUERY
SHOW TABLES;
---- RESULTS
'testtbl_parquet'
---- TYPES
STRING
====
---- QUERY
# The Unicode column name is expected back exactly as it was declared.
DESCRIBE testtbl_parquet;
---- RESULTS: RAW_STRING
'我','int',''
's','string','String col'
---- TYPES
STRING, STRING, STRING
====
---- QUERY
INSERT INTO TABLE testtbl_parquet VALUES (1, 'Alice');
====
---- QUERY
# Read the inserted row back through the Unicode-named column.
SELECT * FROM testtbl_parquet;
---- RESULTS
1,'Alice'
---- TYPES
INT, STRING
====
---- QUERY
DROP TABLE testtbl_parquet;
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Tests for IMPALA-11499
# Parquet table partitioned by a string column. The partition values inserted
# below contain multi-byte UTF-8 characters.
CREATE TABLE unicode_partition_values (id INT)
PARTITIONED BY (p STRING)
STORED AS PARQUET;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Three partitions are written: a multi-character CJK value, the single character
# named in IMPALA-11499, and a CJK value followed by digits and ASCII punctuation.
# The trailing SELECT is expected to return each partition value unchanged.
INSERT INTO unicode_partition_values PARTITION (p='运营业务数据') VALUES (0);
INSERT INTO unicode_partition_values PARTITION (p='运') VALUES (0);
INSERT INTO unicode_partition_values PARTITION (p='运营业务数据1234567890!@#$%^&*(){}[]') VALUES (0);
SELECT * FROM unicode_partition_values;
---- RESULTS: RAW_STRING
0,'运'
0,'运营业务数据'
0,'运营业务数据1234567890!@#$%^&*(){}[]'
---- TYPES
INT, STRING
====
---- QUERY
# Iceberg counterpart of the Unicode partition-value scenario: the table is
# identity-partitioned on the string column p.
CREATE TABLE unicode_partition_values_iceberg (
  id INT,
  p STRING
)
PARTITIONED BY SPEC (identity(p))
STORED BY ICEBERG;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Three rows are written whose partition column holds a multi-character CJK value,
# a single CJK character, and a CJK value followed by digits and ASCII punctuation.
# The trailing SELECT is expected to return each value unchanged.
INSERT INTO unicode_partition_values_iceberg VALUES (0, '运营业务数据');
INSERT INTO unicode_partition_values_iceberg VALUES (0, '运');
INSERT INTO unicode_partition_values_iceberg VALUES (0, '运营业务数据1234567890!@#$%^&*(){}[]');
SELECT * FROM unicode_partition_values_iceberg;
---- RESULTS: RAW_STRING
0,'运'
0,'运营业务数据'
0,'运营业务数据1234567890!@#$%^&*(){}[]'
---- TYPES
INT, STRING
====