mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-14237: Fix Iceberg partition values encoding
This patch modifies the string overload of IcebergFunctions::TruncatePartitionTransform so that it always handles strings as UTF-8 encoded, because the Iceberg specification states that strings are UTF-8 encoded. Also, for an Iceberg table, UrlEncode is now called not in the Hive-compatible way but in the standard way, similar to Java's URLEncoder.encode() (which the Iceberg API also uses), to conform with existing practices in Hive, Spark and Trino. This included a change in the set of characters that are not escaped, to follow the URL Standard's application/x-www-form-urlencoded format. [1] Also renamed the corresponding predicate from ShouldNotEscape to IsUrlSafe for better readability. Testing: * add and extend e2e tests to check partitions with Unicode characters * add be tests to coding-util-test.cc [1]: https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set Change-Id: Iabb39727f6dd49b76c918bcd6b3ec62532555755 Reviewed-on: http://gerrit.cloudera.org:8080/23190 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
062ba4071a
commit
321429eac6
@@ -107,7 +107,12 @@ string TableSinkBase::GetPartitionName(int i) {
|
||||
|
||||
// Returns the URL-encoded form of 'raw_str' for use as a partition value.
// For Iceberg tables (IsIceberg()) UrlEncode() is called with 'false' as its last
// argument, i.e. not in the Hive-compatible mode; for all other tables it is called
// with 'true'. If the encoded string is empty, the table descriptor's null partition
// key value is returned instead.
string TableSinkBase::UrlEncodePartitionValue(const string& raw_str) {
|
||||
string encoded_str;
|
||||
// NOTE(review): this view is a diff without +/- markers; the unconditional call below
// looks like the pre-change line that the if/else replaces -- confirm against the
// actual file.
UrlEncode(raw_str, &encoded_str, true);
|
||||
if (IsIceberg()) {
|
||||
// Iceberg partition values should be URL-encoded, but not in the Hive-compatible way.
|
||||
UrlEncode(raw_str, &encoded_str, false);
|
||||
} else {
|
||||
UrlEncode(raw_str, &encoded_str, true);
|
||||
}
|
||||
// An empty encoded value is replaced by the table's null partition key value.
return encoded_str.empty() ? table_desc_->null_partition_key_value() : encoded_str;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#include "common/compiler-util.h"
|
||||
#include "common/logging.h"
|
||||
#include "exprs/string-functions.h"
|
||||
#include "runtime/timestamp-value.inline.h"
|
||||
#include "thirdparty/murmurhash/MurmurHash3.h"
|
||||
#include "udf/udf-internal.h"
|
||||
@@ -102,7 +103,9 @@ StringVal IcebergFunctions::TruncatePartitionTransform(FunctionContext* ctx,
|
||||
const StringVal& input, const IntVal& width) {
|
||||
if (!CheckInputsAndSetError(ctx, input, width)) return StringVal::null();
|
||||
if (input.len <= width.val) return input;
|
||||
return StringVal::CopyFrom(ctx, input.ptr, width.val);
|
||||
// String handled as UTF8 regardless of utf8_mode, because Iceberg spec states that
|
||||
// character strings must be stored as UTF-8 encoded byte arrays.
|
||||
return StringFunctions::Utf8Substring(ctx, input, 1, width.val);
|
||||
}
|
||||
|
||||
template<typename T, typename W>
|
||||
|
||||
@@ -117,14 +117,39 @@ TEST(UrlCodingTest, PathSeparators) {
|
||||
string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F";
|
||||
TestUrl(test_path, encoded_test_path, false);
|
||||
TestUrl(test_path, encoded_test_path, true);
|
||||
string test = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10"
|
||||
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'"
|
||||
"*/:=?\\{[]^";
|
||||
string encoded_test = "SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
|
||||
"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25"
|
||||
"%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E";
|
||||
TestUrl(test, encoded_test, false);
|
||||
TestUrl(test, encoded_test, true);
|
||||
}
|
||||
|
||||
// Test URL encoding of the ASCII table, character values from 1 to 127.
// The same raw input is checked against two expected outputs by passing 'true' and
// then 'false' as the last argument of TestUrl() (presumably the hive_compat flag;
// TestUrl() is defined outside this view -- confirm there).
|
||||
TEST(UrlCodingTest, AsciiCharacters) {
|
||||
string raw = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D"
|
||||
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19"
|
||||
"\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./"
|
||||
"0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F";
|
||||
// Expected output for the 'true' case: control characters, DEL and a fixed set of
// punctuation are percent-encoded; everything else, including space, is unchanged.
string hive_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
|
||||
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
|
||||
"%1A%1B%1C%1D%1E%1F !%22%23$%25&%27()%2A+,-.%2F"
|
||||
"0123456789%3A;<%3D>%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"%5B%5C%5D%5E_`abcdefghijklmnopqrstuvwxyz%7B|}~%7F";
|
||||
TestUrl(raw, hive_encoded, true);
|
||||
// Expected output for the 'false' case: alphanumerics and the four characters
// dot, hyphen, asterisk and underscore are kept, space becomes '+', and every other
// character is percent-encoded.
string url_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
|
||||
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
|
||||
"%1A%1B%1C%1D%1E%1F+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F"
|
||||
"0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F";
|
||||
TestUrl(raw, url_encoded, false);
|
||||
}
|
||||
|
||||
// Test a few unicode characters that are not in the ASCII table.
// The raw string contains accented Latin, Chinese and Georgian characters separated
// by spaces; it is checked with 'true' and then 'false' as the last argument of
// TestUrl() (presumably the hive_compat flag -- confirm in TestUrl()).
|
||||
TEST(UrlCodingTest, UnicodeCharacters) {
|
||||
string raw = "árvíztűrőtükörfúrógép 你们好 აბგ";
|
||||
// Expected output for the 'true' case is identical to the input: neither the
// non-ASCII bytes nor the spaces are escaped.
string hive_encoded = "árvíztűrőtükörfúrógép 你们好 აბგ";
|
||||
TestUrl(raw, hive_encoded, true);
|
||||
// Expected output for the 'false' case: every byte of each multi-byte UTF-8 sequence
// is percent-encoded and spaces become '+'.
string url_encoded = "%C3%A1rv%C3%ADzt%C5%B1r%C5%91"
|
||||
"t%C3%BCk%C3%B6rf%C3%BAr%C3%B3g%C3%A9p"
|
||||
"+%E4%BD%A0%E4%BB%AC%E5%A5%BD"
|
||||
"+%E1%83%90%E1%83%91%E1%83%92";
|
||||
TestUrl(raw, url_encoded, false);
|
||||
}
|
||||
|
||||
TEST(Base64Test, Basic) {
|
||||
|
||||
@@ -38,9 +38,9 @@ using std::uppercase;
|
||||
|
||||
namespace impala {
|
||||
|
||||
// It is more convenient to maintain the complement of the set of
|
||||
// characters to escape when not in Hive-compat mode.
|
||||
static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
|
||||
// It is more convenient to maintain the set of non-alphanumeric characters that are
|
||||
// safe to use directly in URLs without escaping.
|
||||
static function<bool (char)> IsUrlSafe = is_any_of(".-*_");
|
||||
|
||||
// Hive selectively encodes characters. This is the whitelist of
|
||||
// characters it will encode.
|
||||
@@ -53,18 +53,27 @@ static const std::unordered_set<char> SpecialCharacters = {
|
||||
'\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', '[', ']',
|
||||
'^'};
|
||||
|
||||
// Encodes the input string as a URL-encoded string based on UTF-8.
|
||||
// If 'hive_compat' is set to true, the string is encoded in a Hive-compatible way;
|
||||
// otherwise, a more standard URL encoding is used, similar to the URLEncoder.encode()
|
||||
// method in Java.
|
||||
static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
|
||||
stringstream ss;
|
||||
// "uppercase" and "hex" only affect the insertion of integers, not that of char values.
|
||||
ss << uppercase << hex << setfill('0');
|
||||
for (char ch : std::string_view(in, in_len)) {
|
||||
// Escape the character iff a) we are in Hive-compat mode and the
|
||||
// character is in the Hive whitelist or b) we are not in Hive-compat mode and
|
||||
// the character is not alphanumeric and it is not one of the characters specifically
|
||||
// excluded from escaping (see ShouldNotEscape()).
|
||||
// Escape the character iff
|
||||
// a) we are in Hive-compat mode and the character is in the Hive whitelist or
|
||||
// b) we are not in Hive-compat mode and the character is not alphanumeric
|
||||
// and it is not safe to use in URLs (see IsUrlSafe()).
|
||||
if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat &&
|
||||
!isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) {
|
||||
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
|
||||
!isalnum(static_cast<unsigned char>(ch)) && !IsUrlSafe(ch))) {
|
||||
// Iff we are not in Hive-compat mode, we encode space as '+'.
|
||||
if (!hive_compat && ch == ' ') {
|
||||
ss << '+';
|
||||
} else {
|
||||
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
|
||||
}
|
||||
} else {
|
||||
ss << ch;
|
||||
}
|
||||
|
||||
@@ -427,11 +427,11 @@ INT,BIGINT,DECIMAL,STRING
|
||||
---- QUERY
|
||||
show files in multi_col_truncate;
|
||||
---- RESULTS
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick fox b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+fox\+b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=__HIVE_DEFAULT_PARTITION__/i_trunc=__HIVE_DEFAULT_PARTITION__/b_trunc=__HIVE_DEFAULT_PARTITION__/d_trunc=__HIVE_DEFAULT_PARTITION__/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
---- TYPES
|
||||
STRING, STRING, STRING, STRING
|
||||
@@ -1010,7 +1010,7 @@ STRING,BIGINT,DECIMAL,TIMESTAMP,DATE
|
||||
show files in mixed_and_shuffled;
|
||||
---- RESULTS
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1971-07-01/da_year=1971/s_trunc=green/b_bucket=1/de_trunc=71.00/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink /b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink\+/b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2020-01-06/da_year=2020/s_trunc=quick/b_bucket=1/de_trunc=3333.00/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=1/de_trunc=543.00/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=2/de_trunc=123.00/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
@@ -1088,3 +1088,59 @@ where da is not null;
|
||||
---- TYPES
|
||||
STRING,BIGINT,DECIMAL,TIMESTAMP,DATE
|
||||
====
|
||||
---- QUERY
|
||||
# Test truncate partition transform with Unicode strings
|
||||
create table unicode_truncate (s string)
|
||||
partitioned by spec (truncate(5, s))
|
||||
stored as iceberg;
|
||||
====
|
||||
---- QUERY
|
||||
insert into unicode_truncate values
|
||||
('impala'),
|
||||
('árvíztűrőtükörfúrógép'),
|
||||
('árvíztűrő'),
|
||||
('űűű'),
|
||||
('你们好hello');
|
||||
select * from unicode_truncate;
|
||||
---- RESULTS: RAW_STRING
|
||||
'impala'
|
||||
'árvíztűrőtükörfúrógép'
|
||||
'árvíztűrő'
|
||||
'űűű'
|
||||
'你们好hello'
|
||||
---- TYPES
|
||||
STRING
|
||||
====
|
||||
---- QUERY
|
||||
show files in unicode_truncate;
|
||||
---- RESULTS
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=impal/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C3%A1rv%C3%ADz/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C5%B1%C5%B1%C5%B1/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%E4%BD%A0%E4%BB%AC%E5%A5%BDhe/.*.parq','.*','','$ERASURECODE_POLICY'
|
||||
---- TYPES
|
||||
STRING, STRING, STRING, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from unicode_truncate
|
||||
where s like "árvíz%";
|
||||
---- RESULTS: RAW_STRING
|
||||
'árvíztűrőtükörfúrógép'
|
||||
'árvíztűrő'
|
||||
---- TYPES
|
||||
STRING
|
||||
---- RUNTIME_PROFILE
|
||||
aggregation(SUM, RowsRead): 2
|
||||
aggregation(SUM, NumRowGroups): 4
|
||||
====
|
||||
---- QUERY
|
||||
select * from unicode_truncate
|
||||
where s = "űűű";
|
||||
---- RESULTS: RAW_STRING
|
||||
'űűű'
|
||||
---- TYPES
|
||||
STRING
|
||||
---- RUNTIME_PROFILE
|
||||
aggregation(SUM, RowsRead): 1
|
||||
aggregation(SUM, NumRowGroups): 1
|
||||
====
|
||||
|
||||
@@ -311,6 +311,25 @@ select * from unicode_partition_values;
|
||||
INT, STRING
|
||||
====
|
||||
---- QUERY
|
||||
show files in unicode_partition_values;
|
||||
---- RESULTS: RAW_STRING
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运/.*\.parq','.*','p=运','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D/.*\.parq','.*','p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据/.*\.parq','.*','p=运营业务数据','$ERASURECODE_POLICY'
|
||||
---- TYPES
|
||||
STRING, STRING, STRING, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from unicode_partition_values
|
||||
where p like '运%';
|
||||
---- RESULTS: RAW_STRING
|
||||
0,'运'
|
||||
0,'运营业务数据'
|
||||
0,'运营业务数据1234567890!@#$%^&*(){}[]'
|
||||
---- TYPES
|
||||
INT, STRING
|
||||
====
|
||||
---- QUERY
|
||||
create table unicode_partition_values_iceberg (id int, p string) partitioned by spec (identity(p)) stored by iceberg;
|
||||
---- RESULTS
|
||||
'Table has been created.'
|
||||
@@ -326,4 +345,26 @@ select * from unicode_partition_values_iceberg;
|
||||
0,'运营业务数据1234567890!@#$%^&*(){}[]'
|
||||
---- TYPES
|
||||
INT, STRING
|
||||
====
|
||||
---- QUERY
|
||||
show files in unicode_partition_values_iceberg;
|
||||
---- RESULTS
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90/.*\.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE1234567890%21%40%23%24%25%5E%26\*%28%29%7B%7D%5B%5D/.*\.parq','.*','','$ERASURECODE_POLICY'
|
||||
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE/.*\.parq','.*','','$ERASURECODE_POLICY'
|
||||
---- TYPES
|
||||
STRING, STRING, STRING, STRING
|
||||
====
|
||||
---- QUERY
|
||||
insert into unicode_partition_values_iceberg values (2, '运营业务数据');
|
||||
select * from unicode_partition_values_iceberg
|
||||
where p = '运营业务数据';
|
||||
---- RESULTS: RAW_STRING
|
||||
0,'运营业务数据'
|
||||
2,'运营业务数据'
|
||||
---- TYPES
|
||||
INT, STRING
|
||||
---- RUNTIME_PROFILE
|
||||
aggregation(SUM, RowsRead): 2
|
||||
aggregation(SUM, NumRowGroups): 2
|
||||
====
|
||||
@@ -333,7 +333,7 @@ class TestInsertPartKey(ImpalaTestSuite):
|
||||
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
|
||||
"\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^"
|
||||
part_dir = "p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \
|
||||
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \
|
||||
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25*" \
|
||||
"%2F%3A%3D%3F%5C%7B%5B%5D%23%5E"
|
||||
show_part_value = "SpecialCharacters\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006" \
|
||||
"\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010" \
|
||||
|
||||
Reference in New Issue
Block a user