diff --git a/be/src/exec/hdfs-table-sink.cc b/be/src/exec/hdfs-table-sink.cc index f61d13ad9..01a170117 100644 --- a/be/src/exec/hdfs-table-sink.cc +++ b/be/src/exec/hdfs-table-sink.cc @@ -223,7 +223,9 @@ void HdfsTableSink::BuildHdfsFileNames(OutputPartition* output_partition) { // with 'show partitions ' in Hive, followed by a select // from a decoded partition key value. string encoded_str; - UrlEncode(value_str, &encoded_str); + // The final parameter forces compatibility with Hive, which + // doesn't URL-encode every character. + UrlEncode(value_str, &encoded_str, true); common_suffix << encoded_str; } common_suffix << "/"; diff --git a/be/src/util/url-coding-test.cc b/be/src/util/url-coding-test.cc index a1792ff69..cc98b5f97 100644 --- a/be/src/util/url-coding-test.cc +++ b/be/src/util/url-coding-test.cc @@ -25,14 +25,14 @@ namespace impala { // Tests encoding/decoding of input. If expected_encoded is non-empty, the // encoded string is validated against it. -void TestUrl(const string& input, const string& expected_encoded) { +void TestUrl(const string& input, const string& expected_encoded, bool hive_compat) { string intermediate; - UrlEncode(input, &intermediate); + UrlEncode(input, &intermediate, hive_compat); string output; if (!expected_encoded.empty()) { EXPECT_EQ(intermediate, expected_encoded); } - EXPECT_TRUE(UrlDecode(intermediate, &output)); + EXPECT_TRUE(UrlDecode(intermediate, &output, hive_compat)); EXPECT_EQ(input, output); // Convert string to vector and try that also @@ -40,7 +40,7 @@ void TestUrl(const string& input, const string& expected_encoded) { input_vector.resize(input.size()); memcpy(&input_vector[0], input.c_str(), input.size()); string intermediate2; - UrlEncode(input_vector, &intermediate2); + UrlEncode(input_vector, &intermediate2, hive_compat); EXPECT_EQ(intermediate, intermediate2); } @@ -67,15 +67,22 @@ void TestBase64(const string& input, const string& expected_encoded) { // same that come out. TEST(UrlCodingTest, Basic) { string input = "ABCDEFGHIJKLMNOPQRSTUWXYZ1234567890~!@#$%^&*()<>?,./:\";'{}|[]\\_+-="; - TestUrl(input, ""); + TestUrl(input, "", false); + TestUrl(input, "", true); +} + +TEST(UrlCodingTest, HiveExceptions) { + TestUrl(" +", " +", true); } TEST(UrlCodingTest, BlankString) { - TestUrl("", ""); + TestUrl("", "", false); + TestUrl("", "", true); } TEST(UrlCodingTest, PathSeparators) { - TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F"); + TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false); + TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true); } TEST(Base64Test, Basic) { diff --git a/be/src/util/url-coding.cc b/be/src/util/url-coding.cc index 4f3540c4a..9338bf142 100644 --- a/be/src/util/url-coding.cc +++ b/be/src/util/url-coding.cc @@ -31,38 +31,53 @@ using namespace boost::archive::iterators; namespace impala { -static inline void UrlEncode(const char* in, int in_len, string* out) { +// Hive selectively encodes characters. This is the whitelist of +// characters it will encode. +// See common/src/java/org/apache/hadoop/hive/common/FileUtils.java +// in the Hive source code for the source of this list. +static function HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF"); + +// It is more convenient to maintain the complement of the set of +// characters to escape when not in Hive-compat mode. +static function ShouldNotEscape = is_any_of("-_.~"); + +static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) { (*out).reserve(in_len); stringstream ss; for (int i = 0; i < in_len; ++i) { const char ch = in[i]; - if (isalnum(ch) || ch == '-' || ch == '_' || ch == '.' || ch == '~') { - ss << ch; - } else { + // Escape the character iff a) we are in Hive-compat mode and the + // character is in the Hive whitelist or b) we are not in + // Hive-compat mode, and the character is not alphanumeric or one + // of the four commonly excluded characters. + if ((hive_compat && HiveShouldEscape(ch)) || + (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) { ss << '%' << uppercase << hex << static_cast(ch); + } else { + ss << ch; } } (*out) = ss.str(); } -void UrlEncode(const vector& in, string* out) { +void UrlEncode(const vector& in, string* out, bool hive_compat) { if (in.empty()) { *out = ""; } else { - UrlEncode(reinterpret_cast(&in[0]), in.size(), out); + UrlEncode(reinterpret_cast(&in[0]), in.size(), out, hive_compat); } } -void UrlEncode(const string& in, string* out) { - UrlEncode(in.c_str(), in.size(), out); +void UrlEncode(const string& in, string* out, bool hive_compat) { + UrlEncode(in.c_str(), in.size(), out, hive_compat); } // Adapted from // http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/ // example/http/server3/request_handler.cpp // See http://www.boost.org/LICENSE_1_0.txt for license for this method. -bool UrlDecode(const string& in, string* out) { +bool UrlDecode(const string& in, string* out, bool hive_compat) { out->clear(); out->reserve(in.size()); for (size_t i = 0; i < in.size(); ++i) { @@ -79,7 +94,7 @@ bool UrlDecode(const string& in, string* out) { } else { return false; } - } else if (in[i] == '+') { + } else if (!hive_compat && in[i] == '+') { // Hive does not encode ' ' as '+' (*out) += ' '; } else { (*out) += in[i]; @@ -95,7 +110,7 @@ static inline void Base64Encode(const char* in, int in_len, stringstream* out) { copy(base64_encode(in), base64_encode(in + in_len), ostream_iterator(*out)); int bytes_written = out->tellp() - len_before; // Pad with = to make it valid base64 encoded string - int num_pad = bytes_written % 4; + int num_pad = bytes_written % 4; if (num_pad != 0) { num_pad = 4 - num_pad; for (int i = 0; i < num_pad; ++i) { @@ -138,7 +153,7 @@ bool Base64Decode(const string& in, string* out) { binary_from_base64 , 8, 6> base64_decode; string tmp = in; // Replace padding with base64 encoded NULL - replace(tmp.begin(), tmp.end(), '=', 'A'); + replace(tmp.begin(), tmp.end(), '=', 'A'); try { *out = string(base64_decode(tmp.begin()), base64_decode(tmp.end())); } catch(std::exception& e) { diff --git a/be/src/util/url-coding.h b/be/src/util/url-coding.h index bcb4de62a..c8fbe901a 100644 --- a/be/src/util/url-coding.h +++ b/be/src/util/url-coding.h @@ -23,12 +23,19 @@ namespace impala { // Utility method to URL-encode a string (that is, replace special // characters with %). -void UrlEncode(const std::string& in, std::string* out); -void UrlEncode(const std::vector& in, std::string* out); +// The optional parameter hive_compat controls whether we mimic Hive's +// behaviour when encoding a string, which is only to encode certain +// characters (excluding, e.g., ' ') +void UrlEncode(const std::string& in, std::string* out, bool hive_compat = false); +void UrlEncode(const std::vector& in, std::string* out, + bool hive_compat = false); // Utility method to decode a string that was URL-encoded. Returns // true unless the string could not be correctly decoded. -bool UrlDecode(const std::string& in, std::string* out); +// The optional parameter hive_compat controls whether or not we treat +// the strings as encoded by Hive, which means selectively ignoring +// certain characters like ' '. +bool UrlDecode(const std::string& in, std::string* out, bool hive_compat = false); // Utility method to encode input as base-64 encoded. This is not // very performant (multiple string copies) and should not be used diff --git a/testdata/workloads/functional-query/queries/QueryTest/insert.test b/testdata/workloads/functional-query/queries/QueryTest/insert.test index 88c88b951..e0514142e 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/insert.test +++ b/testdata/workloads/functional-query/queries/QueryTest/insert.test @@ -454,3 +454,21 @@ string, string ---- RESULTS 'value','/\%.' ==== +---- QUERY +# static partition insert into string-partitioned table with non-escaped special characters +# (Hive chooses not to escape + and ' ') +INSERT INTO TABLE insert_string_partitioned PARTITION(s2="_.~ +") +SELECT "value" FROM alltypessmall LIMIT 1; +---- SETUP +DROP PARTITIONS insert_string_partitioned +---- RESULTS +s2=_.~ +/: 1 +==== +---- QUERY +# select with unencoded partition key +SELECT * FROM insert_string_partitioned; +---- TYPES +string, string +---- RESULTS +'value','_.~ +' +====