IMPALA-14237: Fix Iceberg partition values encoding

This patch modifies the string overload of
IcebergFunctions::TruncatePartitionTransform so that it always handles
strings as UTF-8-encoded ones, because the Iceberg specification states
that strings are UTF-8 encoded.

Also, for an Iceberg table UrlEncode is called not in the
Hive-compatible way but in the standard way, similar to Java's
URLEncoder.encode() (which the Iceberg API also uses), to conform with
existing practices by Hive, Spark and Trino. This included a change in
the set of characters which are not escaped, so that it follows the URL
Standard's application/x-www-form-urlencoded format. [1] Also renamed the
predicate for that set from ShouldNotEscape to IsUrlSafe for better
readability.

Testing:
 * add and extend e2e tests to check partitions with Unicode characters
 * add backend (BE) tests to coding-util-test.cc

[1]: https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set

Change-Id: Iabb39727f6dd49b76c918bcd6b3ec62532555755
Reviewed-on: http://gerrit.cloudera.org:8080/23190
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Daniel Vanko
2025-07-18 15:56:17 +02:00
committed by Impala Public Jenkins
parent 062ba4071a
commit 321429eac6
7 changed files with 165 additions and 26 deletions

View File

@@ -107,7 +107,12 @@ string TableSinkBase::GetPartitionName(int i) {
string TableSinkBase::UrlEncodePartitionValue(const string& raw_str) {
string encoded_str;
UrlEncode(raw_str, &encoded_str, true);
if (IsIceberg()) {
// Iceberg partition values should be URL encoded, but not in the Hive-compatible way.
UrlEncode(raw_str, &encoded_str, false);
} else {
UrlEncode(raw_str, &encoded_str, true);
}
return encoded_str.empty() ? table_desc_->null_partition_key_value() : encoded_str;
}

View File

@@ -22,6 +22,7 @@
#include "common/compiler-util.h"
#include "common/logging.h"
#include "exprs/string-functions.h"
#include "runtime/timestamp-value.inline.h"
#include "thirdparty/murmurhash/MurmurHash3.h"
#include "udf/udf-internal.h"
@@ -102,7 +103,9 @@ StringVal IcebergFunctions::TruncatePartitionTransform(FunctionContext* ctx,
const StringVal& input, const IntVal& width) {
if (!CheckInputsAndSetError(ctx, input, width)) return StringVal::null();
if (input.len <= width.val) return input;
return StringVal::CopyFrom(ctx, input.ptr, width.val);
// String handled as UTF8 regardless of utf8_mode, because Iceberg spec states that
// character strings must be stored as UTF-8 encoded byte arrays.
return StringFunctions::Utf8Substring(ctx, input, 1, width.val);
}
template<typename T, typename W>

View File

@@ -117,14 +117,39 @@ TEST(UrlCodingTest, PathSeparators) {
string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F";
TestUrl(test_path, encoded_test_path, false);
TestUrl(test_path, encoded_test_path, true);
string test = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10"
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'"
"*/:=?\\{[]^";
string encoded_test = "SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25"
"%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E";
TestUrl(test, encoded_test, false);
TestUrl(test, encoded_test, true);
}
// Test URL encoding of the ASCII table, character values from 1 to 127.
TEST(UrlCodingTest, AsciiCharacters) {
string raw = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D"
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19"
"\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./"
"0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F";
string hive_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
"%1A%1B%1C%1D%1E%1F !%22%23$%25&%27()%2A+,-.%2F"
"0123456789%3A;<%3D>%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"%5B%5C%5D%5E_`abcdefghijklmnopqrstuvwxyz%7B|}~%7F";
TestUrl(raw, hive_encoded, true);
string url_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
"%1A%1B%1C%1D%1E%1F+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F"
"0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F";
TestUrl(raw, url_encoded, false);
}
// Test a few unicode characters that are not in the ASCII table.
TEST(UrlCodingTest, UnicodeCharacters) {
string raw = "árvíztűrőtükörfúrógép 你们好 აბგ";
string hive_encoded = "árvíztűrőtükörfúrógép 你们好 აბგ";
TestUrl(raw, hive_encoded, true);
string url_encoded = "%C3%A1rv%C3%ADzt%C5%B1r%C5%91"
"t%C3%BCk%C3%B6rf%C3%BAr%C3%B3g%C3%A9p"
"+%E4%BD%A0%E4%BB%AC%E5%A5%BD"
"+%E1%83%90%E1%83%91%E1%83%92";
TestUrl(raw, url_encoded, false);
}
TEST(Base64Test, Basic) {

View File

@@ -38,9 +38,9 @@ using std::uppercase;
namespace impala {
// It is more convenient to maintain the complement of the set of
// characters to escape when not in Hive-compat mode.
static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
// It is more convenient to maintain the set of characters that are safe to use
// directly in URLs without escaping
static function<bool (char)> IsUrlSafe = is_any_of(".-*_");
// Hive selectively encodes characters. This is the whitelist of
// characters it will encode.
@@ -53,18 +53,27 @@ static const std::unordered_set<char> SpecialCharacters = {
'\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', '[', ']',
'^'};
// Encodes the input string as a URL-encoded string based on UTF-8.
// If 'hive_compat' is set to true, the string is encoded in a Hive-compatible way;
// otherwise, a more standard URL encoding is used, similar to the URLEncoder.encode()
// method in Java.
static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
stringstream ss;
// "uppercase" and "hex" only affect the insertion of integers, not that of char values.
ss << uppercase << hex << setfill('0');
for (char ch : std::string_view(in, in_len)) {
// Escape the character iff a) we are in Hive-compat mode and the
// character is in the Hive whitelist or b) we are not in Hive-compat mode and
// the character is not alphanumeric and it is not one of the characters specifically
// excluded from escaping (see ShouldNotEscape()).
// Escape the character iff
// a) we are in Hive-compat mode and the character is in the Hive whitelist or
// b) we are not in Hive-compat mode and the character is not alphanumeric
// and it is not safe to use in URLs (see IsUrlSafe()).
if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat &&
!isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) {
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
!isalnum(static_cast<unsigned char>(ch)) && !IsUrlSafe(ch))) {
// Iff we are not in Hive-compat mode, we encode space as '+'.
if (!hive_compat && ch == ' ') {
ss << '+';
} else {
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
}
} else {
ss << ch;
}