mirror of
https://github.com/apache/impala.git
synced 2026-01-04 18:00:57 -05:00
IMPALA-258: Make partition key string encoding Hive-compatible
This commit is contained in:
@@ -223,7 +223,9 @@ void HdfsTableSink::BuildHdfsFileNames(OutputPartition* output_partition) {
|
||||
// with 'show partitions <tbl>' in Hive, followed by a select
|
||||
// from a decoded partition key value.
|
||||
string encoded_str;
|
||||
UrlEncode(value_str, &encoded_str);
|
||||
// The final parameter forces compatibility with Hive, which
|
||||
// doesn't URL-encode every character.
|
||||
UrlEncode(value_str, &encoded_str, true);
|
||||
common_suffix << encoded_str;
|
||||
}
|
||||
common_suffix << "/";
|
||||
|
||||
@@ -25,14 +25,14 @@ namespace impala {
|
||||
|
||||
// Tests encoding/decoding of input. If expected_encoded is non-empty, the
|
||||
// encoded string is validated against it.
|
||||
void TestUrl(const string& input, const string& expected_encoded) {
|
||||
void TestUrl(const string& input, const string& expected_encoded, bool hive_compat) {
|
||||
string intermediate;
|
||||
UrlEncode(input, &intermediate);
|
||||
UrlEncode(input, &intermediate, hive_compat);
|
||||
string output;
|
||||
if (!expected_encoded.empty()) {
|
||||
EXPECT_EQ(intermediate, expected_encoded);
|
||||
}
|
||||
EXPECT_TRUE(UrlDecode(intermediate, &output));
|
||||
EXPECT_TRUE(UrlDecode(intermediate, &output, hive_compat));
|
||||
EXPECT_EQ(input, output);
|
||||
|
||||
// Convert string to vector and try that also
|
||||
@@ -40,7 +40,7 @@ void TestUrl(const string& input, const string& expected_encoded) {
|
||||
input_vector.resize(input.size());
|
||||
memcpy(&input_vector[0], input.c_str(), input.size());
|
||||
string intermediate2;
|
||||
UrlEncode(input_vector, &intermediate2);
|
||||
UrlEncode(input_vector, &intermediate2, hive_compat);
|
||||
EXPECT_EQ(intermediate, intermediate2);
|
||||
}
|
||||
|
||||
@@ -67,15 +67,22 @@ void TestBase64(const string& input, const string& expected_encoded) {
|
||||
// same that come out.
|
||||
TEST(UrlCodingTest, Basic) {
|
||||
string input = "ABCDEFGHIJKLMNOPQRSTUWXYZ1234567890~!@#$%^&*()<>?,./:\";'{}|[]\\_+-=";
|
||||
TestUrl(input, "");
|
||||
TestUrl(input, "", false);
|
||||
TestUrl(input, "", true);
|
||||
}
|
||||
|
||||
TEST(UrlCodingTest, HiveExceptions) {
|
||||
TestUrl(" +", " +", true);
|
||||
}
|
||||
|
||||
TEST(UrlCodingTest, BlankString) {
|
||||
TestUrl("", "");
|
||||
TestUrl("", "", false);
|
||||
TestUrl("", "", true);
|
||||
}
|
||||
|
||||
TEST(UrlCodingTest, PathSeparators) {
|
||||
TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F");
|
||||
TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false);
|
||||
TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true);
|
||||
}
|
||||
|
||||
TEST(Base64Test, Basic) {
|
||||
|
||||
@@ -31,38 +31,53 @@ using namespace boost::archive::iterators;
|
||||
|
||||
namespace impala {
|
||||
|
||||
static inline void UrlEncode(const char* in, int in_len, string* out) {
|
||||
// Hive selectively encodes characters. This is the whitelist of
|
||||
// characters it will encode.
|
||||
// See common/src/java/org/apache/hadoop/hive/common/FileUtils.java
|
||||
// in the Hive source code for the source of this list.
|
||||
static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF");
|
||||
|
||||
// It is more convenient to maintain the complement of the set of
|
||||
// characters to escape when not in Hive-compat mode.
|
||||
static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
|
||||
|
||||
static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
|
||||
(*out).reserve(in_len);
|
||||
stringstream ss;
|
||||
for (int i = 0; i < in_len; ++i) {
|
||||
const char ch = in[i];
|
||||
if (isalnum(ch) || ch == '-' || ch == '_' || ch == '.' || ch == '~') {
|
||||
ss << ch;
|
||||
} else {
|
||||
// Escape the character iff a) we are in Hive-compat mode and the
|
||||
// character is in the Hive whitelist or b) we are not in
|
||||
// Hive-compat mode, and the character is not alphanumeric or one
|
||||
// of the four commonly excluded characters.
|
||||
if ((hive_compat && HiveShouldEscape(ch)) ||
|
||||
(!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) {
|
||||
ss << '%' << uppercase << hex << static_cast<uint32_t>(ch);
|
||||
} else {
|
||||
ss << ch;
|
||||
}
|
||||
}
|
||||
|
||||
(*out) = ss.str();
|
||||
}
|
||||
|
||||
void UrlEncode(const vector<uint8_t>& in, string* out) {
|
||||
void UrlEncode(const vector<uint8_t>& in, string* out, bool hive_compat) {
|
||||
if (in.empty()) {
|
||||
*out = "";
|
||||
} else {
|
||||
UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out);
|
||||
UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out, hive_compat);
|
||||
}
|
||||
}
|
||||
|
||||
void UrlEncode(const string& in, string* out) {
|
||||
UrlEncode(in.c_str(), in.size(), out);
|
||||
void UrlEncode(const string& in, string* out, bool hive_compat) {
|
||||
UrlEncode(in.c_str(), in.size(), out, hive_compat);
|
||||
}
|
||||
|
||||
// Adapted from
|
||||
// http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/
|
||||
// example/http/server3/request_handler.cpp
|
||||
// See http://www.boost.org/LICENSE_1_0.txt for license for this method.
|
||||
bool UrlDecode(const string& in, string* out) {
|
||||
bool UrlDecode(const string& in, string* out, bool hive_compat) {
|
||||
out->clear();
|
||||
out->reserve(in.size());
|
||||
for (size_t i = 0; i < in.size(); ++i) {
|
||||
@@ -79,7 +94,7 @@ bool UrlDecode(const string& in, string* out) {
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (in[i] == '+') {
|
||||
} else if (!hive_compat && in[i] == '+') { // Hive does not encode ' ' as '+'
|
||||
(*out) += ' ';
|
||||
} else {
|
||||
(*out) += in[i];
|
||||
@@ -95,7 +110,7 @@ static inline void Base64Encode(const char* in, int in_len, stringstream* out) {
|
||||
copy(base64_encode(in), base64_encode(in + in_len), ostream_iterator<char>(*out));
|
||||
int bytes_written = out->tellp() - len_before;
|
||||
// Pad with = to make it valid base64 encoded string
|
||||
int num_pad = bytes_written % 4;
|
||||
int num_pad = bytes_written % 4;
|
||||
if (num_pad != 0) {
|
||||
num_pad = 4 - num_pad;
|
||||
for (int i = 0; i < num_pad; ++i) {
|
||||
@@ -138,7 +153,7 @@ bool Base64Decode(const string& in, string* out) {
|
||||
binary_from_base64<string::const_iterator> , 8, 6> base64_decode;
|
||||
string tmp = in;
|
||||
// Replace padding with base64 encoded NULL
|
||||
replace(tmp.begin(), tmp.end(), '=', 'A');
|
||||
replace(tmp.begin(), tmp.end(), '=', 'A');
|
||||
try {
|
||||
*out = string(base64_decode(tmp.begin()), base64_decode(tmp.end()));
|
||||
} catch(std::exception& e) {
|
||||
|
||||
@@ -23,12 +23,19 @@ namespace impala {
|
||||
|
||||
// Utility method to URL-encode a string (that is, replace special
|
||||
// characters with %<hex value in ascii>).
|
||||
void UrlEncode(const std::string& in, std::string* out);
|
||||
void UrlEncode(const std::vector<uint8_t>& in, std::string* out);
|
||||
// The optional parameter hive_compat controls whether we mimic Hive's
|
||||
// behaviour when encoding a string, which is only to encode certain
|
||||
// characters (excluding, e.g., ' ')
|
||||
void UrlEncode(const std::string& in, std::string* out, bool hive_compat = false);
|
||||
void UrlEncode(const std::vector<uint8_t>& in, std::string* out,
|
||||
bool hive_compat = false);
|
||||
|
||||
// Utility method to decode a string that was URL-encoded. Returns
|
||||
// true unless the string could not be correctly decoded.
|
||||
bool UrlDecode(const std::string& in, std::string* out);
|
||||
// The optional parameter hive_compat controls whether or not we treat
|
||||
// the strings as encoded by Hive, which means selectively ignoring
|
||||
// certain characters like ' '.
|
||||
bool UrlDecode(const std::string& in, std::string* out, bool hive_compat = false);
|
||||
|
||||
// Utility method to encode input as base-64 encoded. This is not
|
||||
// very performant (multiple string copies) and should not be used
|
||||
|
||||
@@ -454,3 +454,21 @@ string, string
|
||||
---- RESULTS
|
||||
'value','/\%.'
|
||||
====
|
||||
---- QUERY
|
||||
# static partition insert into string-partitioned table with non-escaped special characters
|
||||
# (Hive chooses not to escape + and ' ')
|
||||
INSERT INTO TABLE insert_string_partitioned PARTITION(s2="_.~ +")
|
||||
SELECT "value" FROM alltypessmall LIMIT 1;
|
||||
---- SETUP
|
||||
DROP PARTITIONS insert_string_partitioned
|
||||
---- RESULTS
|
||||
s2=_.~ +/: 1
|
||||
====
|
||||
---- QUERY
|
||||
# select with unencoded partition key
|
||||
SELECT * FROM insert_string_partitioned;
|
||||
---- TYPES
|
||||
string, string
|
||||
---- RESULTS
|
||||
'value','_.~ +'
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user