IMPALA-14237: Fix Iceberg partition values encoding

This patch modifies the string overload of
IcebergFunctions::TruncatePartitionTransform so that it always handles
strings as UTF-8-encoded ones, because the Iceberg specification states
that strings are UTF-8 encoded.

Also, for an Iceberg table UrlEncode is now called not in the
Hive-compatible way but in the standard way, similar to Java's
URLEncoder.encode() (which the Iceberg API also uses), to conform with
existing practices by Hive, Spark and Trino. This included a change in
the set of characters which are not escaped, so that it follows the URL
Standard's application/x-www-form-urlencoded format. [1] The predicate
for this set was also renamed from ShouldNotEscape to IsUrlSafe for
better readability.

Testing:
 * add and extend e2e tests to check partitions with Unicode characters
 * add be tests to coding-util-test.cc

[1]: https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set

Change-Id: Iabb39727f6dd49b76c918bcd6b3ec62532555755
Reviewed-on: http://gerrit.cloudera.org:8080/23190
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Daniel Vanko
2025-07-18 15:56:17 +02:00
committed by Impala Public Jenkins
parent 062ba4071a
commit 321429eac6
7 changed files with 165 additions and 26 deletions

View File

@@ -107,7 +107,12 @@ string TableSinkBase::GetPartitionName(int i) {
string TableSinkBase::UrlEncodePartitionValue(const string& raw_str) {
string encoded_str;
UrlEncode(raw_str, &encoded_str, true);
if (IsIceberg()) {
// Iceberg partition values should be URL encoded, but not in the Hive-compatible way.
UrlEncode(raw_str, &encoded_str, false);
} else {
UrlEncode(raw_str, &encoded_str, true);
}
return encoded_str.empty() ? table_desc_->null_partition_key_value() : encoded_str;
}

View File

@@ -22,6 +22,7 @@
#include "common/compiler-util.h"
#include "common/logging.h"
#include "exprs/string-functions.h"
#include "runtime/timestamp-value.inline.h"
#include "thirdparty/murmurhash/MurmurHash3.h"
#include "udf/udf-internal.h"
@@ -102,7 +103,9 @@ StringVal IcebergFunctions::TruncatePartitionTransform(FunctionContext* ctx,
const StringVal& input, const IntVal& width) {
if (!CheckInputsAndSetError(ctx, input, width)) return StringVal::null();
if (input.len <= width.val) return input;
return StringVal::CopyFrom(ctx, input.ptr, width.val);
// String handled as UTF8 regardless of utf8_mode, because Iceberg spec states that
// character strings must be stored as UTF-8 encoded byte arrays.
return StringFunctions::Utf8Substring(ctx, input, 1, width.val);
}
template<typename T, typename W>

View File

@@ -117,14 +117,39 @@ TEST(UrlCodingTest, PathSeparators) {
string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F";
TestUrl(test_path, encoded_test_path, false);
TestUrl(test_path, encoded_test_path, true);
string test = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10"
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'"
"*/:=?\\{[]^";
string encoded_test = "SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25"
"%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E";
TestUrl(test, encoded_test, false);
TestUrl(test, encoded_test, true);
}
// Test URL encoding of the ASCII table, character values from 1 to 127.
// The same raw input is checked against two expected encodings, one for each value of
// the boolean passed as TestUrl()'s third argument.
// NOTE(review): TestUrl() is defined outside this view; presumably the boolean is
// UrlEncode()'s 'hive_compat' flag - confirm against the helper.
TEST(UrlCodingTest, AsciiCharacters) {
// Every byte value from 0x01 through 0x7F, in ascending order.
string raw = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D"
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19"
"\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./"
"0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F";
// Expected result when the flag is true: only the control characters, DEL (0x7F) and
// the characters " # % ' * / : = ? [ ] ^ { plus the backslash are percent-encoded.
// Everything else, including the space, is left as is.
string hive_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
"%1A%1B%1C%1D%1E%1F !%22%23$%25&%27()%2A+,-.%2F"
"0123456789%3A;<%3D>%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"%5B%5C%5D%5E_`abcdefghijklmnopqrstuvwxyz%7B|}~%7F";
TestUrl(raw, hive_encoded, true);
// Expected result when the flag is false: the space becomes '+', alphanumerics and
// the characters * - . _ are left as is, and every other byte is percent-encoded.
string url_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
"%1A%1B%1C%1D%1E%1F+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F"
"0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F";
TestUrl(raw, url_encoded, false);
}
// Test a few unicode characters that are not in the ASCII table.
// The same raw input is checked against two expected encodings, one for each value of
// the boolean passed as TestUrl()'s third argument.
// NOTE(review): TestUrl() is defined outside this view; presumably the boolean is
// UrlEncode()'s 'hive_compat' flag - confirm against the helper.
TEST(UrlCodingTest, UnicodeCharacters) {
// Three space-separated words made of non-ASCII characters from different scripts.
string raw = "árvíztűrőtükörfúrógép 你们好 აბგ";
// Expected result when the flag is true: identical to the raw input, i.e. neither the
// non-ASCII characters nor the spaces are escaped.
string hive_encoded = "árvíztűrőtükörfúrógép 你们好 აბგ";
TestUrl(raw, hive_encoded, true);
// Expected result when the flag is false: every byte of each non-ASCII character is
// percent-encoded (e.g. 'á' becomes %C3%A1), ASCII letters are left as is and each
// space becomes '+'.
string url_encoded = "%C3%A1rv%C3%ADzt%C5%B1r%C5%91"
"t%C3%BCk%C3%B6rf%C3%BAr%C3%B3g%C3%A9p"
"+%E4%BD%A0%E4%BB%AC%E5%A5%BD"
"+%E1%83%90%E1%83%91%E1%83%92";
TestUrl(raw, url_encoded, false);
}
TEST(Base64Test, Basic) {

View File

@@ -38,9 +38,9 @@ using std::uppercase;
namespace impala {
// It is more convenient to maintain the complement of the set of
// characters to escape when not in Hive-compat mode.
static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
// It is more convenient to maintain the set of characters that are safe to use
// directly in URLs without escaping
static function<bool (char)> IsUrlSafe = is_any_of(".-*_");
// Hive selectively encodes characters. This is the whitelist of
// characters it will encode.
@@ -53,18 +53,27 @@ static const std::unordered_set<char> SpecialCharacters = {
'\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', '[', ']',
'^'};
// Encodes the input string as a URL-encoded string based on UTF-8.
// If 'hive_compat' is set to true, the string is encoded in a Hive-compatible way;
// otherwise, a more standard URL encoding is used, similar to the URLEncoder.encode()
// method in Java.
static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
stringstream ss;
// "uppercase" and "hex" only affect the insertion of integers, not that of char values.
ss << uppercase << hex << setfill('0');
for (char ch : std::string_view(in, in_len)) {
// Escape the character iff a) we are in Hive-compat mode and the
// character is in the Hive whitelist or b) we are not in Hive-compat mode and
// the character is not alphanumeric and it is not one of the characters specifically
// excluded from escaping (see ShouldNotEscape()).
// Escape the character iff
// a) we are in Hive-compat mode and the character is in the Hive whitelist or
// b) we are not in Hive-compat mode and the character is not alphanumeric
// and it is not safe to use in URLs (see IsUrlSafe()).
if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat &&
!isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) {
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
!isalnum(static_cast<unsigned char>(ch)) && !IsUrlSafe(ch))) {
// Iff we are not in Hive-compat mode, we encode space as '+'.
if (!hive_compat && ch == ' ') {
ss << '+';
} else {
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
}
} else {
ss << ch;
}

View File

@@ -427,11 +427,11 @@ INT,BIGINT,DECIMAL,STRING
---- QUERY
show files in multi_col_truncate;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick fox b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+fox\+b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=__HIVE_DEFAULT_PARTITION__/i_trunc=__HIVE_DEFAULT_PARTITION__/b_trunc=__HIVE_DEFAULT_PARTITION__/d_trunc=__HIVE_DEFAULT_PARTITION__/.*.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
@@ -1010,7 +1010,7 @@ STRING,BIGINT,DECIMAL,TIMESTAMP,DATE
show files in mixed_and_shuffled;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1971-07-01/da_year=1971/s_trunc=green/b_bucket=1/de_trunc=71.00/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink /b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink\+/b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2020-01-06/da_year=2020/s_trunc=quick/b_bucket=1/de_trunc=3333.00/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=1/de_trunc=543.00/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=2/de_trunc=123.00/.*.parq','.*','','$ERASURECODE_POLICY'
@@ -1088,3 +1088,59 @@ where da is not null;
---- TYPES
STRING,BIGINT,DECIMAL,TIMESTAMP,DATE
====
---- QUERY
# Test truncate partition transform with Unicode strings
create table unicode_truncate (s string)
partitioned by spec (truncate(5, s))
stored as iceberg;
====
---- QUERY
insert into unicode_truncate values
('impala'),
('árvíztűrőtükörfúrógép'),
('árvíztűrő'),
('űűű'),
('你们好hello');
select * from unicode_truncate;
---- RESULTS: RAW_STRING
'impala'
'árvíztűrőtükörfúrógép'
'árvíztűrő'
'űűű'
'你们好hello'
---- TYPES
STRING
====
---- QUERY
show files in unicode_truncate;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=impal/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C3%A1rv%C3%ADz/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C5%B1%C5%B1%C5%B1/.*.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%E4%BD%A0%E4%BB%AC%E5%A5%BDhe/.*.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
select * from unicode_truncate
where s like "árvíz%";
---- RESULTS: RAW_STRING
'árvíztűrőtükörfúrógép'
'árvíztűrő'
---- TYPES
STRING
---- RUNTIME_PROFILE
aggregation(SUM, RowsRead): 2
aggregation(SUM, NumRowGroups): 4
====
---- QUERY
select * from unicode_truncate
where s = "űűű";
---- RESULTS: RAW_STRING
'űűű'
---- TYPES
STRING
---- RUNTIME_PROFILE
aggregation(SUM, RowsRead): 1
aggregation(SUM, NumRowGroups): 1
====

View File

@@ -311,6 +311,25 @@ select * from unicode_partition_values;
INT, STRING
====
---- QUERY
show files in unicode_partition_values;
---- RESULTS: RAW_STRING
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运/.*\.parq','.*','p=运','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D/.*\.parq','.*','p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据/.*\.parq','.*','p=运营业务数据','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
select * from unicode_partition_values
where p like '运%';
---- RESULTS: RAW_STRING
0,'运'
0,'运营业务数据'
0,'运营业务数据1234567890!@#$%^&*(){}[]'
---- TYPES
INT, STRING
====
---- QUERY
create table unicode_partition_values_iceberg (id int, p string) partitioned by spec (identity(p)) stored by iceberg;
---- RESULTS
'Table has been created.'
@@ -326,4 +345,26 @@ select * from unicode_partition_values_iceberg;
0,'运营业务数据1234567890!@#$%^&*(){}[]'
---- TYPES
INT, STRING
====
---- QUERY
show files in unicode_partition_values_iceberg;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90/.*\.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE1234567890%21%40%23%24%25%5E%26\*%28%29%7B%7D%5B%5D/.*\.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE/.*\.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
insert into unicode_partition_values_iceberg values (2, '运营业务数据');
select * from unicode_partition_values_iceberg
where p = '运营业务数据';
---- RESULTS: RAW_STRING
0,'运营业务数据'
2,'运营业务数据'
---- TYPES
INT, STRING
---- RUNTIME_PROFILE
aggregation(SUM, RowsRead): 2
aggregation(SUM, NumRowGroups): 2
====

View File

@@ -333,7 +333,7 @@ class TestInsertPartKey(ImpalaTestSuite):
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
"\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^"
part_dir = "p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25*" \
"%2F%3A%3D%3F%5C%7B%5B%5D%23%5E"
show_part_value = "SpecialCharacters\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006" \
"\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010" \