IMPALA-258: Make partition key string encoding Hive-compatible

This commit is contained in:
Henry Robinson
2013-04-14 22:26:37 -07:00
parent abafcf81ff
commit 7d2c47ad72
5 changed files with 72 additions and 23 deletions

View File

@@ -223,7 +223,9 @@ void HdfsTableSink::BuildHdfsFileNames(OutputPartition* output_partition) {
// with 'show partitions <tbl>' in Hive, followed by a select
// from a decoded partition key value.
string encoded_str;
UrlEncode(value_str, &encoded_str);
// The final parameter forces compatibility with Hive, which
// doesn't URL-encode every character.
UrlEncode(value_str, &encoded_str, true);
common_suffix << encoded_str;
}
common_suffix << "/";

View File

@@ -25,14 +25,14 @@ namespace impala {
// Tests encoding/decoding of input. If expected_encoded is non-empty, the
// encoded string is validated against it.
void TestUrl(const string& input, const string& expected_encoded) {
void TestUrl(const string& input, const string& expected_encoded, bool hive_compat) {
string intermediate;
UrlEncode(input, &intermediate);
UrlEncode(input, &intermediate, hive_compat);
string output;
if (!expected_encoded.empty()) {
EXPECT_EQ(intermediate, expected_encoded);
}
EXPECT_TRUE(UrlDecode(intermediate, &output));
EXPECT_TRUE(UrlDecode(intermediate, &output, hive_compat));
EXPECT_EQ(input, output);
// Convert string to vector and try that also
@@ -40,7 +40,7 @@ void TestUrl(const string& input, const string& expected_encoded) {
input_vector.resize(input.size());
memcpy(&input_vector[0], input.c_str(), input.size());
string intermediate2;
UrlEncode(input_vector, &intermediate2);
UrlEncode(input_vector, &intermediate2, hive_compat);
EXPECT_EQ(intermediate, intermediate2);
}
@@ -67,15 +67,22 @@ void TestBase64(const string& input, const string& expected_encoded) {
// same that come out.
TEST(UrlCodingTest, Basic) {
string input = "ABCDEFGHIJKLMNOPQRSTUWXYZ1234567890~!@#$%^&*()<>?,./:\";'{}|[]\\_+-=";
TestUrl(input, "");
TestUrl(input, "", false);
TestUrl(input, "", true);
}
TEST(UrlCodingTest, HiveExceptions) {
TestUrl(" +", " +", true);
}
TEST(UrlCodingTest, BlankString) {
TestUrl("", "");
TestUrl("", "", false);
TestUrl("", "", true);
}
TEST(UrlCodingTest, PathSeparators) {
TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F");
TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false);
TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true);
}
TEST(Base64Test, Basic) {

View File

@@ -31,38 +31,53 @@ using namespace boost::archive::iterators;
namespace impala {
static inline void UrlEncode(const char* in, int in_len, string* out) {
// Hive selectively encodes characters. This is the whitelist of
// characters it will encode.
// See common/src/java/org/apache/hadoop/hive/common/FileUtils.java
// in the Hive source code for the source of this list.
static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF");
// It is more convenient to maintain the complement of the set of
// characters to escape when not in Hive-compat mode.
static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
(*out).reserve(in_len);
stringstream ss;
for (int i = 0; i < in_len; ++i) {
const char ch = in[i];
if (isalnum(ch) || ch == '-' || ch == '_' || ch == '.' || ch == '~') {
ss << ch;
} else {
// Escape the character iff a) we are in Hive-compat mode and the
// character is in the Hive whitelist or b) we are not in
// Hive-compat mode, and the character is not alphanumeric or one
// of the four commonly excluded characters.
if ((hive_compat && HiveShouldEscape(ch)) ||
(!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) {
ss << '%' << uppercase << hex << static_cast<uint32_t>(ch);
} else {
ss << ch;
}
}
(*out) = ss.str();
}
void UrlEncode(const vector<uint8_t>& in, string* out) {
void UrlEncode(const vector<uint8_t>& in, string* out, bool hive_compat) {
if (in.empty()) {
*out = "";
} else {
UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out);
UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out, hive_compat);
}
}
void UrlEncode(const string& in, string* out) {
UrlEncode(in.c_str(), in.size(), out);
void UrlEncode(const string& in, string* out, bool hive_compat) {
UrlEncode(in.c_str(), in.size(), out, hive_compat);
}
// Adapted from
// http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/
// example/http/server3/request_handler.cpp
// See http://www.boost.org/LICENSE_1_0.txt for license for this method.
bool UrlDecode(const string& in, string* out) {
bool UrlDecode(const string& in, string* out, bool hive_compat) {
out->clear();
out->reserve(in.size());
for (size_t i = 0; i < in.size(); ++i) {
@@ -79,7 +94,7 @@ bool UrlDecode(const string& in, string* out) {
} else {
return false;
}
} else if (in[i] == '+') {
} else if (!hive_compat && in[i] == '+') { // Hive does not encode ' ' as '+'
(*out) += ' ';
} else {
(*out) += in[i];
@@ -95,7 +110,7 @@ static inline void Base64Encode(const char* in, int in_len, stringstream* out) {
copy(base64_encode(in), base64_encode(in + in_len), ostream_iterator<char>(*out));
int bytes_written = out->tellp() - len_before;
// Pad with = to make it valid base64 encoded string
int num_pad = bytes_written % 4;
int num_pad = bytes_written % 4;
if (num_pad != 0) {
num_pad = 4 - num_pad;
for (int i = 0; i < num_pad; ++i) {
@@ -138,7 +153,7 @@ bool Base64Decode(const string& in, string* out) {
binary_from_base64<string::const_iterator> , 8, 6> base64_decode;
string tmp = in;
// Replace padding with base64 encoded NULL
replace(tmp.begin(), tmp.end(), '=', 'A');
replace(tmp.begin(), tmp.end(), '=', 'A');
try {
*out = string(base64_decode(tmp.begin()), base64_decode(tmp.end()));
} catch(std::exception& e) {

View File

@@ -23,12 +23,19 @@ namespace impala {
// Utility method to URL-encode a string (that is, replace special
// characters with %<hex value in ascii>).
void UrlEncode(const std::string& in, std::string* out);
void UrlEncode(const std::vector<uint8_t>& in, std::string* out);
// The optional parameter hive_compat controls whether we mimic Hive's
// behaviour when encoding a string, which is only to encode certain
// characters (excluding, e.g., ' ')
void UrlEncode(const std::string& in, std::string* out, bool hive_compat = false);
void UrlEncode(const std::vector<uint8_t>& in, std::string* out,
bool hive_compat = false);
// Utility method to decode a string that was URL-encoded. Returns
// true unless the string could not be correctly decoded.
bool UrlDecode(const std::string& in, std::string* out);
// The optional parameter hive_compat controls whether or not we treat
// the strings as encoded by Hive, which means selectively ignoring
// certain characters like ' '.
bool UrlDecode(const std::string& in, std::string* out, bool hive_compat = false);
// Utility method to encode input as base-64 encoded. This is not
// very performant (multiple string copies) and should not be used

View File

@@ -454,3 +454,21 @@ string, string
---- RESULTS
'value','/\%.'
====
---- QUERY
# static partition insert into string-partitioned table with non-escaped special characters
# (Hive chooses not to escape + and ' ')
INSERT INTO TABLE insert_string_partitioned PARTITION(s2="_.~ +")
SELECT "value" FROM alltypessmall LIMIT 1;
---- SETUP
DROP PARTITIONS insert_string_partitioned
---- RESULTS
s2=_.~ +/: 1
====
---- QUERY
# select with unencoded partition key
SELECT * FROM insert_string_partitioned;
---- TYPES
string, string
---- RESULTS
'value','_.~ +'
====