mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-14237: Fix Iceberg partition values encoding
This patch modifies the string overload of IcebergFunctions::TruncatePartitionTransform so that it always handles strings as UTF-8-encoded ones, because the Iceberg specification states that strings are UTF-8 encoded. Also, for an Iceberg table, UrlEncode is now called not in the Hive-compatible way but in the standard way, similar to Java's URLEncoder.encode() (which the Iceberg API also uses), to conform with existing practices by Hive, Spark and Trino. This included changing the set of characters that are not escaped, so that it follows the URL Standard's application/x-www-form-urlencoded format. [1] Also renamed the corresponding predicate from ShouldNotEscape to IsUrlSafe for better readability. Testing: * add and extend e2e tests to check partitions with Unicode characters * add be tests to coding-util-test.cc [1]: https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set Change-Id: Iabb39727f6dd49b76c918bcd6b3ec62532555755 Reviewed-on: http://gerrit.cloudera.org:8080/23190 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
062ba4071a
commit
321429eac6
@@ -107,7 +107,12 @@ string TableSinkBase::GetPartitionName(int i) {
|
||||
|
||||
string TableSinkBase::UrlEncodePartitionValue(const string& raw_str) {
|
||||
string encoded_str;
|
||||
UrlEncode(raw_str, &encoded_str, true);
|
||||
if (IsIceberg()) {
|
||||
// Iceberg partition values should be URL encoded, but not Hive compatible way.
|
||||
UrlEncode(raw_str, &encoded_str, false);
|
||||
} else {
|
||||
UrlEncode(raw_str, &encoded_str, true);
|
||||
}
|
||||
return encoded_str.empty() ? table_desc_->null_partition_key_value() : encoded_str;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#include "common/compiler-util.h"
|
||||
#include "common/logging.h"
|
||||
#include "exprs/string-functions.h"
|
||||
#include "runtime/timestamp-value.inline.h"
|
||||
#include "thirdparty/murmurhash/MurmurHash3.h"
|
||||
#include "udf/udf-internal.h"
|
||||
@@ -102,7 +103,9 @@ StringVal IcebergFunctions::TruncatePartitionTransform(FunctionContext* ctx,
|
||||
const StringVal& input, const IntVal& width) {
|
||||
if (!CheckInputsAndSetError(ctx, input, width)) return StringVal::null();
|
||||
if (input.len <= width.val) return input;
|
||||
return StringVal::CopyFrom(ctx, input.ptr, width.val);
|
||||
// String handled as UTF8 regardless of utf8_mode, because Iceberg spec states that
|
||||
// character strings must be stored as UTF-8 encoded byte arrays.
|
||||
return StringFunctions::Utf8Substring(ctx, input, 1, width.val);
|
||||
}
|
||||
|
||||
template<typename T, typename W>
|
||||
|
||||
@@ -117,14 +117,39 @@ TEST(UrlCodingTest, PathSeparators) {
|
||||
string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F";
|
||||
TestUrl(test_path, encoded_test_path, false);
|
||||
TestUrl(test_path, encoded_test_path, true);
|
||||
string test = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10"
|
||||
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'"
|
||||
"*/:=?\\{[]^";
|
||||
string encoded_test = "SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
|
||||
"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25"
|
||||
"%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E";
|
||||
TestUrl(test, encoded_test, false);
|
||||
TestUrl(test, encoded_test, true);
|
||||
}
|
||||
|
||||
// Test URL encoding of the ASCII table, character values from 1 to 127.
|
||||
TEST(UrlCodingTest, AsciiCharacters) {
|
||||
string raw = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D"
|
||||
"\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19"
|
||||
"\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./"
|
||||
"0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F";
|
||||
string hive_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
|
||||
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
|
||||
"%1A%1B%1C%1D%1E%1F !%22%23$%25&%27()%2A+,-.%2F"
|
||||
"0123456789%3A;<%3D>%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"%5B%5C%5D%5E_`abcdefghijklmnopqrstuvwxyz%7B|}~%7F";
|
||||
TestUrl(raw, hive_encoded, true);
|
||||
string url_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
|
||||
"%0E%0F%10%11%12%13%14%15%16%17%18%19"
|
||||
"%1A%1B%1C%1D%1E%1F+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F"
|
||||
"0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F";
|
||||
TestUrl(raw, url_encoded, false);
|
||||
}
|
||||
|
||||
// Test a few unicode characters that are not in the ASCII table.
|
||||
TEST(UrlCodingTest, UnicodeCharacters) {
|
||||
string raw = "árvíztűrőtükörfúrógép 你们好 აბგ";
|
||||
string hive_encoded = "árvíztűrőtükörfúrógép 你们好 აბგ";
|
||||
TestUrl(raw, hive_encoded, true);
|
||||
string url_encoded = "%C3%A1rv%C3%ADzt%C5%B1r%C5%91"
|
||||
"t%C3%BCk%C3%B6rf%C3%BAr%C3%B3g%C3%A9p"
|
||||
"+%E4%BD%A0%E4%BB%AC%E5%A5%BD"
|
||||
"+%E1%83%90%E1%83%91%E1%83%92";
|
||||
TestUrl(raw, url_encoded, false);
|
||||
}
|
||||
|
||||
TEST(Base64Test, Basic) {
|
||||
|
||||
@@ -38,9 +38,9 @@ using std::uppercase;
|
||||
|
||||
namespace impala {
|
||||
|
||||
// It is more convenient to maintain the complement of the set of
|
||||
// characters to escape when not in Hive-compat mode.
|
||||
static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
|
||||
// It is more convenient to maintain the set of characters that are safe to use
|
||||
// directly in URLs without escaping
|
||||
static function<bool (char)> IsUrlSafe = is_any_of(".-*_");
|
||||
|
||||
// Hive selectively encodes characters. This is the whitelist of
|
||||
// characters it will encode.
|
||||
@@ -53,18 +53,27 @@ static const std::unordered_set<char> SpecialCharacters = {
|
||||
'\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', '[', ']',
|
||||
'^'};
|
||||
|
||||
// Encodes the input string as a URL-encoded string based on UTF-8.
|
||||
// If 'hive_compat' is set to true, the string is encoded in a Hive-compatible way;
|
||||
// otherwise, a more standard URL encoding is used, similar to the URLEncoder.encode()
|
||||
// method in Java.
|
||||
static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
|
||||
stringstream ss;
|
||||
// "uppercase" and "hex" only affect the insertion of integers, not that of char values.
|
||||
ss << uppercase << hex << setfill('0');
|
||||
for (char ch : std::string_view(in, in_len)) {
|
||||
// Escape the character iff a) we are in Hive-compat mode and the
|
||||
// character is in the Hive whitelist or b) we are not in Hive-compat mode and
|
||||
// the character is not alphanumeric and it is not one of the characters specifically
|
||||
// excluded from escaping (see ShouldNotEscape()).
|
||||
// Escape the character iff
|
||||
// a) we are in Hive-compat mode and the character is in the Hive whitelist or
|
||||
// b) we are not in Hive-compat mode and the character is not alphanumeric
|
||||
// and it is not safe to use in URLs (see IsUrlSafe()).
|
||||
if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat &&
|
||||
!isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) {
|
||||
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
|
||||
!isalnum(static_cast<unsigned char>(ch)) && !IsUrlSafe(ch))) {
|
||||
// Iff we are not in Hive-compat mode, we encode space as '+'.
|
||||
if (!hive_compat && ch == ' ') {
|
||||
ss << '+';
|
||||
} else {
|
||||
ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch));
|
||||
}
|
||||
} else {
|
||||
ss << ch;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user