diff --git a/be/src/benchmarks/convert-timestamp-benchmark.cc b/be/src/benchmarks/convert-timestamp-benchmark.cc
index a39829ab7..18b14fd47 100644
--- a/be/src/benchmarks/convert-timestamp-benchmark.cc
+++ b/be/src/benchmarks/convert-timestamp-benchmark.cc
@@ -37,8 +37,11 @@
#include "exprs/timezone_db.h"
#include "exprs/timestamp-functions.h"
#include "runtime/datetime-simple-date-format-parser.h"
+#include "runtime/test-env.h"
#include "runtime/timestamp-value.h"
#include "runtime/timestamp-value.inline.h"
+#include "service/fe-support.h"
+#include "service/frontend.h"
#include "util/benchmark.h"
#include "util/cpu-info.h"
#include "util/pretty-printer.h"
@@ -49,118 +52,120 @@
using std::random_device;
using std::mt19937;
using std::uniform_int_distribution;
-using std::thread;
using namespace impala;
using namespace datetime_parse_util;
// Benchmark tests for timestamp time-zone conversions
/*
-Machine Info: Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz
+Machine Info: 12th Gen Intel(R) Core(TM) i9-12900K
UtcToUnixTime: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (glibc) 7.98 8.14 8.3 1X 1X 1X
- (Google/CCTZ) 17.9 18.2 18.5 2.24X 2.24X 2.23X
- (boost) 301 306 311 37.7X 37.5X 37.5X
+ (glibc) 12 12.1 12.2 1X 1X 1X
+ (Google/CCTZ) 25.2 25.5 25.5 2.1X 2.11X 2.09X
+ (boost) 635 643 646 53X 53.3X 52.9X
LocalToUnixTime: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (glibc) 0.717 0.732 0.745 1X 1X 1X
- (Google/CCTZ) 15.3 15.5 15.8 21.3X 21.2X 21.2X
+ (glibc) 0.739 0.745 0.745 1X 1X 1X
+ (Google/CCTZ) 23.2 23.4 23.7 31.4X 31.4X 31.7X
FromUtc: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (boost) 1.6 1.63 1.67 1X 1X 1X
- (Google/CCTZ) 14.5 14.8 15.2 9.06X 9.09X 9.11X
+ (boost) 2.59 2.6 2.6 1X 1X 1X
+ (Google/CCTZ) 24.4 24.7 24.9 9.45X 9.5X 9.58X
ToUtc: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (boost) 1.63 1.67 1.68 1X 1X 1X
- (Google/CCTZ) 8.7 8.9 9.05 5.34X 5.34X 5.38X
+ (boost) 2.6 2.64 2.65 1X 1X 1X
+ (Google/CCTZ) 13.5 13.6 13.7 5.2X 5.16X 5.18X
UtcToLocal: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (glibc) 2.68 2.75 2.8 1X 1X 1X
- (Google/CCTZ) 15 15.2 15.5 5.59X 5.55X 5.53X
+ (glibc) 4.42 4.42 4.44 1X 1X 1X
+ (Google/CCTZ) 25.6 26.6 27 5.78X 6.01X 6.08X
+ (JVM) 0.511 0.596 0.6 0.115X 0.135X 0.135X
UnixTimeToLocalPtime: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (glibc) 2.69 2.75 2.8 1X 1X 1X
- (Google/CCTZ) 14.8 15.1 15.4 5.5X 5.49X 5.52X
+ (glibc) 4.51 4.53 4.53 1X 1X 1X
+ (Google/CCTZ) 23.7 24.1 24.4 5.25X 5.32X 5.4X
UnixTimeToUtcPtime: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (glibc) 17 17.6 17.9 1X 1X 1X
- (Google/CCTZ) 6.45 6.71 6.81 0.379X 0.382X 0.38X
- (fast path) 25.1 26 26.4 1.47X 1.48X 1.48X
- (day split) 48.6 50.3 51.3 2.85X 2.87X 2.86X
+ (glibc) 24.8 25.1 25.6 1X 1X 1X
+ (Google/CCTZ) 13.9 14 14.1 0.562X 0.557X 0.553X
+ (fast path) 54.3 54.8 55.4 2.19X 2.18X 2.17X
+ (day split) 196 199 200 7.89X 7.92X 7.81X
UtcFromUnixTimeMicros: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (sec split (old)) 17.9 18.7 19.1 1X 1X 1X
- (day split) 111 116 118 6.21X 6.19X 6.19X
+ (sec split (old)) 30.1 31 31.7 1X 1X 1X
+ (day split) 526 532 534 17.5X 17.2X 16.9X
FromUnixTimeNanos: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (sec split (old)) 18.7 19.5 19.8 1X 1X 1X
- (sec split (new)) 104 108 110 5.58X 5.55X 5.57X
+ (sec split (old)) 36.8 37.5 39 1X 1X 1X
+ (sec split (new)) 319 323 324 8.68X 8.61X 8.33X
FromSubsecondUnixTime: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
- (old) 18.7 18.7 18.7 1X 1X 1X
- (new) 73.5 74.1 74.1 3.94X 3.96X 3.96X
+ (old) 37.1 38 38.7 1X 1X 1X
+ (new) 175 175 177 4.71X 4.6X 4.59X
Number of threads: 8
UtcToUnixTime:
- (glibc) elapsed time: 1s020ms
- (Google/CCTZ) elapsed time: 144ms
- (boost) elapsed time: 10ms
-cctz speedup: 7.0784
-boost speedup: 95.1732
+ (glibc) elapsed time: 793ms
+ (Google/CCTZ) elapsed time: 152ms
+ (boost) elapsed time: 3ms
+cctz speedup: 5.20977
+boost speedup: 260.517
LocalToUnixTime:
- (glibc) elapsed time: 18s050ms
- (Google/CCTZ) elapsed time: 212ms
-speedup: 84.9949
+ (glibc) elapsed time: 21s750ms
+ (Google/CCTZ) elapsed time: 242ms
+speedup: 89.6388
FromUtc:
- (boost) elapsed time: 1s519ms
- (Google/CCTZ) elapsed time: 263ms
-speedup: 5.77003
+ (boost) elapsed time: 1s298ms
+ (Google/CCTZ) elapsed time: 279ms
+speedup: 4.64642
ToUtc:
- (boost) elapsed time: 1s674ms
- (Google/CCTZ) elapsed time: 325ms
-speedup: 5.13874
+ (boost) elapsed time: 1s436ms
+ (Google/CCTZ) elapsed time: 496ms
+speedup: 2.89263
UtcToLocal:
- (glibc) elapsed time: 4s862ms
- (Google/CCTZ) elapsed time: 263ms
-speedup: 18.4253
+ (glibc) elapsed time: 4s565ms
+ (Google/CCTZ) elapsed time: 260ms
+ (JVM) elapsed time: 2s507ms
+cctz speedup: 17.5058
+jvm speedup: 1.82038
UnixTimeToLocalPtime:
- (glibc) elapsed time: 4s856ms
- (Google/CCTZ) elapsed time: 259ms
-speedup: 18.7398
+ (glibc) elapsed time: 4s576ms
+ (Google/CCTZ) elapsed time: 268ms
+speedup: 17.0547
UnixTimeToUtcPtime:
- (glibc) elapsed time: 928ms
- (Google/CCTZ) elapsed time: 282ms
- (fast path) elapsed time: 90ms
-cctz speedup: 3.28187
-fast path speedup: 10.2951
+ (glibc) elapsed time: 478ms
+ (Google/CCTZ) elapsed time: 123ms
+ (fast path) elapsed time: 25ms
+cctz speedup: 3.87304
+fast path speedup: 18.9835
*/
vector<TimestampValue> AddTestDataDateTimes(int n, const string& startstr) {
@@ -223,11 +228,11 @@ public:
}
// Create and start threads.
- vector<unique_ptr<thread>> threads(num_of_threads);
+ vector<unique_ptr<std::thread>> threads(num_of_threads);
StopWatch sw;
sw.Start();
for (int i = 0; i < num_of_threads; ++i) {
- threads[i] = make_unique<thread>(
+ threads[i] = make_unique<std::thread>(
run_test, batch_size, test_data[i].get());
}
@@ -403,6 +408,13 @@ TimestampValue cctz_utc_to_local(const TimestampValue& ts_value) {
return TimestampValue(d, t);
}
+// JVM
+TimestampValue jvm_utc_to_local(const TimestampValue& ts_value) {
+ TimestampValue result = ts_value;
+ result.HiveLegacyUtcToLocal(*PTR_CCTZ_LOCAL_TZ);
+ return result;
+}
+
//
// Test FromUtc (CCTZ is expected to be faster than boost)
@@ -603,8 +615,7 @@ TimestampValue old_split_utc_from_unix_time_nanos(const SplitNanoAndSecond& unix
TimestampValue new_split_utc_from_unix_time_nanos(const SplitNanoAndSecond& unix_time) {
// The TimestampValue version is used as it is hard to reproduce the same logic without
// accessing private members.
- return TimestampValue::FromUnixTimeNanos(unix_time.seconds, unix_time.nanos,
- &TimezoneDatabase::GetUtcTimezone());
+ return TimestampValue::FromUnixTimeNanos(unix_time.seconds, unix_time.nanos, UTCPTR);
}
TimestampValue from_subsecond_unix_time_old(const double& unix_time) {
@@ -621,8 +632,7 @@ TimestampValue from_subsecond_unix_time_new(const double& unix_time) {
const double ONE_BILLIONTH = 0.000000001;
int64_t unix_time_whole = unix_time;
int64_t nanos = (unix_time - unix_time_whole) / ONE_BILLIONTH;
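+ // E.g. unix_time = 1.25 yields unix_time_whole = 1 and nanos = 250000000.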
- return TimestampValue::FromUnixTimeNanos(
- unix_time_whole, nanos, &TimezoneDatabase::GetUtcTimezone());
+ return TimestampValue::FromUnixTimeNanos(unix_time_whole, nanos, UTCPTR);
}
//
@@ -698,7 +708,11 @@ TimestampVal cctz_to_utc(const TimestampVal& ts_val) {
int main(int argc, char* argv[]) {
- CpuInfo::Init();
+ impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
+ impala::InitFeSupport();
+ TestEnv test_env;
+ CHECK(test_env.Init().ok());
+
cout << Benchmark::GetMachineInfo() << endl;
ABORT_IF_ERROR(TimezoneDatabase::Initialize());
@@ -783,9 +797,12 @@ int main(int argc, char* argv[]) {
tsvalue_data;
TestData<TimestampValue, TimestampValue> cctz_utc_to_local_data =
tsvalue_data;
+ TestData<TimestampValue, TimestampValue> jvm_utc_to_local_data =
+ tsvalue_data;
glibc_utc_to_local_data.add_to_benchmark(bm_utc_to_local, "(glibc)");
cctz_utc_to_local_data.add_to_benchmark(bm_utc_to_local, "(Google/CCTZ)");
+ jvm_utc_to_local_data.add_to_benchmark(bm_utc_to_local, "(JVM)");
cout << bm_utc_to_local.Measure() << endl;
bail_if_results_dont_match(vector<const TestData<TimestampValue, TimestampValue>*>{
@@ -795,7 +812,7 @@ int main(int argc, char* argv[]) {
vector<time_t> time_data;
for (const TimestampValue& tsvalue: tsvalue_data) {
time_t unix_time;
- tsvalue.ToUnixTime(&TimezoneDatabase::GetUtcTimezone(), &unix_time);
+ tsvalue.ToUnixTime(UTCPTR, &unix_time);
time_data.push_back(unix_time);
}
@@ -859,7 +876,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < tsvalue_data.size(); ++i) {
const TimestampValue& tsvalue = tsvalue_data[i];
time_t unix_time;
- tsvalue.ToUnixTime(&TimezoneDatabase::GetUtcTimezone(), &unix_time);
+ tsvalue.ToUnixTime(UTCPTR, &unix_time);
int micros = (i * 1001) % MICROS_PER_SEC; // add some sub-second part
microsec_data.push_back(unix_time * MICROS_PER_SEC + micros);
}
@@ -885,7 +902,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < tsvalue_data.size(); ++i) {
const TimestampValue& tsvalue = tsvalue_data[i];
time_t unix_time;
- tsvalue.ToUnixTime(&TimezoneDatabase::GetUtcTimezone(), &unix_time);
+ tsvalue.ToUnixTime(UTCPTR, &unix_time);
int nanos = (i * 1001) % NANOS_PER_SEC; // add some sub-second part
nanosec_data.push_back(SplitNanoAndSecond {unix_time, nanos} );
}
@@ -911,7 +928,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < tsvalue_data.size(); ++i) {
const TimestampValue& tsvalue = tsvalue_data[i];
time_t unix_time;
- tsvalue.ToUnixTime(&TimezoneDatabase::GetUtcTimezone(), &unix_time);
+ tsvalue.ToUnixTime(UTCPTR, &unix_time);
double nanos = (i * 1001) % NANOS_PER_SEC; // add some sub-second part
double_data.push_back((double)unix_time + nanos / NANOS_PER_SEC);
}
@@ -979,7 +996,10 @@ int main(int argc, char* argv[]) {
num_of_threads, BATCH_SIZE, tsvalue_data, "(glibc)");
m2 = cctz_utc_to_local_data.measure_multithreaded_elapsed_time(
num_of_threads, BATCH_SIZE, tsvalue_data, "(Google/CCTZ)");
- cout << "speedup: " << double(m1)/double(m2) << endl;
+ m3 = jvm_utc_to_local_data.measure_multithreaded_elapsed_time(
+ num_of_threads, BATCH_SIZE, tsvalue_data, "(JVM)");
+ cout << "cctz speedup: " << double(m1)/double(m2) << endl;
+ cout << "jvm speedup: " << double(m1)/double(m3) << endl;
// UnixTimeToLocalPtime
cout << endl << "UnixTimeToLocalPtime:" << endl;
diff --git a/be/src/benchmarks/date-benchmark.cc b/be/src/benchmarks/date-benchmark.cc
index 84ccd6324..cca4b0bb0 100644
--- a/be/src/benchmarks/date-benchmark.cc
+++ b/be/src/benchmarks/date-benchmark.cc
@@ -26,6 +26,8 @@
#include "runtime/datetime-simple-date-format-parser.h"
#include "runtime/date-parse-util.h"
#include "runtime/timestamp-parse-util.h"
+#include "runtime/timestamp-value.h"
+#include "runtime/timestamp-value.inline.h"
#include "util/benchmark.h"
#include "util/cpu-info.h"
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 9e33ff360..14aec5520 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -21,6 +21,8 @@
#include
#include
+#include <boost/algorithm/string.hpp>
+#include <boost/range/adaptor/transformed.hpp>
#include
#include
@@ -53,6 +55,9 @@
#include "common/names.h"
+using boost::adaptors::transformed;
+using boost::algorithm::iequals;
+using boost::algorithm::join;
using std::move;
using std::sort;
using namespace impala;
@@ -564,14 +569,13 @@ Status HdfsParquetScanner::ResolveSchemaForStatFiltering(SlotDescriptor* slot_de
}
ColumnStatsReader HdfsParquetScanner::CreateStatsReader(
- const parquet::FileMetaData& file_metadata, const parquet::RowGroup& row_group,
- SchemaNode* node, const ColumnType& col_type) {
+ const parquet::RowGroup& row_group, SchemaNode* node, const ColumnType& col_type) {
DCHECK(node);
int col_idx = node->col_idx;
DCHECK_LT(col_idx, row_group.columns.size());
- const vector<parquet::ColumnOrder>& col_orders = file_metadata.column_orders;
+ const vector<parquet::ColumnOrder>& col_orders = file_metadata_.column_orders;
const parquet::ColumnOrder* col_order =
col_idx < col_orders.size() ? &col_orders[col_idx] : nullptr;
@@ -585,8 +589,7 @@ ColumnStatsReader HdfsParquetScanner::CreateStatsReader(
return stat_reader;
}
-Status HdfsParquetScanner::EvaluateStatsConjuncts(
- const parquet::FileMetaData& file_metadata, const parquet::RowGroup& row_group,
+Status HdfsParquetScanner::EvaluateStatsConjuncts(const parquet::RowGroup& row_group,
bool* skip_row_group) {
*skip_row_group = false;
@@ -623,7 +626,7 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
}
ColumnStatsReader stats_reader =
- CreateStatsReader(file_metadata, row_group, node, slot_desc->type());
+ CreateStatsReader(row_group, node, slot_desc->type());
bool all_nulls = false;
if (stats_reader.AllNulls(&all_nulls) && all_nulls) {
@@ -686,8 +689,7 @@ bool HdfsParquetScanner::FilterAlreadyDisabledOrOverlapWithColumnStats(
}
Status HdfsParquetScanner::EvaluateOverlapForRowGroup(
- const parquet::FileMetaData& file_metadata, const parquet::RowGroup& row_group,
- bool* skip_row_group) {
+ const parquet::RowGroup& row_group, bool* skip_row_group) {
*skip_row_group = false;
if (!state_->query_options().parquet_read_statistics) return Status::OK();
@@ -764,7 +766,7 @@ Status HdfsParquetScanner::EvaluateOverlapForRowGroup(
break;
}
ColumnStatsReader stats_reader =
- CreateStatsReader(file_metadata, row_group, node, slot_desc->type());
+ CreateStatsReader(row_group, node, slot_desc->type());
bool all_nulls = false;
if (stats_reader.AllNulls(&all_nulls) && all_nulls) {
@@ -926,8 +928,7 @@ Status HdfsParquetScanner::NextRowGroup() {
// Evaluate row group statistics with stats conjuncts.
bool skip_row_group_on_stats;
- RETURN_IF_ERROR(
- EvaluateStatsConjuncts(file_metadata_, row_group, &skip_row_group_on_stats));
+ RETURN_IF_ERROR(EvaluateStatsConjuncts(row_group, &skip_row_group_on_stats));
if (skip_row_group_on_stats) {
COUNTER_ADD(num_stats_filtered_row_groups_counter_, 1);
continue;
@@ -935,8 +936,7 @@ Status HdfsParquetScanner::NextRowGroup() {
// Evaluate row group statistics with min/max filters.
bool skip_row_group_on_minmax;
- RETURN_IF_ERROR(
- EvaluateOverlapForRowGroup(file_metadata_, row_group, &skip_row_group_on_minmax));
+ RETURN_IF_ERROR(EvaluateOverlapForRowGroup(row_group, &skip_row_group_on_minmax));
if (skip_row_group_on_minmax) {
COUNTER_ADD(num_minmax_filtered_row_groups_counter_, 1);
continue;
@@ -1487,7 +1487,7 @@ Status HdfsParquetScanner::FindSkipRangesForPagesWithMinMaxFilters(
}
ColumnStatsReader stats_reader =
- CreateStatsReader(file_metadata_, row_group, node, slot_desc->type());
+ CreateStatsReader(row_group, node, slot_desc->type());
DCHECK_LT(col_idx, row_group.columns.size());
const parquet::ColumnChunk& col_chunk = row_group.columns[col_idx];
@@ -1570,7 +1570,7 @@ Status HdfsParquetScanner::EvaluatePageIndex() {
}
int col_idx = node->col_idx;
ColumnStatsReader stats_reader =
- CreateStatsReader(file_metadata_, row_group, node, slot_desc->type());
+ CreateStatsReader(row_group, node, slot_desc->type());
DCHECK_LT(col_idx, row_group.columns.size());
const parquet::ColumnChunk& col_chunk = row_group.columns[col_idx];
@@ -2826,6 +2826,13 @@ Status HdfsParquetScanner::ProcessFooter() {
RETURN_IF_ERROR(ParquetMetadataUtils::ValidateFileVersion(file_metadata_, filename()));
+ if (VLOG_FILE_IS_ON) {
+ VLOG_FILE << "Parquet metadata for " << filename() << " created by "
+ << file_metadata_.created_by << ":\n"
+ << join(file_metadata_.key_value_metadata | transformed(
+ [](parquet::KeyValue kv) { return kv.key + "=" + kv.value; }), "\n");
+ }
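+ // The logging expression above is a boost range pipeline: 'transformed' lazily maps
+ // each parquet::KeyValue to a "key=value" string and 'join' concatenates the results
+ // with newlines; the VLOG_FILE_IS_ON guard means no work is done unless enabled.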
+
// IMPALA-3943: Do not throw an error for empty files for backwards compatibility.
if (file_metadata_.num_rows == 0) {
// Warn if the num_rows is inconsistent with the row group metadata.
@@ -3157,8 +3164,41 @@ ParquetTimestampDecoder HdfsParquetScanner::CreateTimestampDecoder(
state_->query_options().convert_legacy_hive_parquet_utc_timestamps &&
state_->local_time_zone() != UTCPTR;
- return ParquetTimestampDecoder(element, state_->local_time_zone(),
- timestamp_conversion_needed_for_int96_timestamps);
+ const Timezone* timezone = state_->local_time_zone();
+ bool hive_legacy_conversion = false;
+ if (timestamp_conversion_needed_for_int96_timestamps && GetHiveZoneConversionLegacy()) {
+ VLOG_FILE << "Using Hive legacy timezone conversion";
+ hive_legacy_conversion = true;
+ }
+
+ return ParquetTimestampDecoder(element, timezone,
+ timestamp_conversion_needed_for_int96_timestamps, hive_legacy_conversion);
+}
+
+bool HdfsParquetScanner::GetHiveZoneConversionLegacy() const {
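+ // Decision order, summarizing the logic below:
+ //   1. writer.zone.conversion.legacy present -> use its boolean value.
+ //   2. otherwise, writer.time.zone present -> new conversion (Hive 3.1.2+ writer).
+ //   3. otherwise -> fall back to the use_legacy_hive_timestamp_conversion option.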
+ string writer_zone_conversion_legacy;
+ string writer_time_zone;
+ for (const parquet::KeyValue& kv : file_metadata_.key_value_metadata) {
+ if (kv.key == "writer.zone.conversion.legacy") {
+ writer_zone_conversion_legacy = kv.value;
+ } else if (kv.key == "writer.time.zone") {
+ writer_time_zone = kv.value;
+ }
+ }
+
+ if (writer_zone_conversion_legacy != "") {
+ return iequals(writer_zone_conversion_legacy, "true");
+ }
+
+ // There is no explicit metadata about the legacy conversion.
+ if (writer_time_zone != "") {
+ // There is metadata about the timezone, so we can infer that the new APIs were
+ // used when the file was written.
+ return false;
+ }
+
+ // There is no (relevant) metadata in the file, use the configuration.
+ return state_->query_options().use_legacy_hive_timestamp_conversion;
}
void HdfsParquetScanner::UpdateCompressedPageSizeCounter(int64_t compressed_page_size) {
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.h b/be/src/exec/parquet/hdfs-parquet-scanner.h
index f3c8b51e1..42cfd7d2e 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.h
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.h
@@ -587,10 +587,9 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
}
/// Evaluates the min/max predicates of the 'scan_node_' using the parquet::Statistics
- /// of 'row_group'. 'file_metadata' is used to determine the ordering that was used to
- /// compute the statistics. Sets 'skip_row_group' to true if the row group can be
- /// skipped, 'false' otherwise.
- Status EvaluateStatsConjuncts(const parquet::FileMetaData& file_metadata,
+ /// of 'row_group'. Sets 'skip_row_group' to true if the row group can be skipped,
+ /// 'false' otherwise.
+ Status EvaluateStatsConjuncts(
const parquet::RowGroup& row_group, bool* skip_row_group) WARN_UNUSED_RESULT;
/// Advances 'row_group_idx_' to the next non-empty row group and initializes
@@ -601,12 +600,10 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
Status NextRowGroup() WARN_UNUSED_RESULT;
/// Evaluates the overlap predicates of the 'scan_node_' using the parquet::Statistics
- /// of 'row_group'. 'file_metadata' is used to determine the ordering that was used to
- /// compute the statistics. Sets 'skip_row_group' to true if the row group can be
- /// skipped, 'false' otherwise.
+ /// of 'row_group'. Sets 'skip_row_group' to true if the row group can be skipped,
+ /// 'false' otherwise.
Status EvaluateOverlapForRowGroup(
- const parquet::FileMetaData& file_metadata, const parquet::RowGroup& row_group,
- bool* skip_row_group);
+ const parquet::RowGroup& row_group, bool* skip_row_group);
/// Return true if filter 'minmax_filter' of filter id 'filter_id' is too close to
/// column min/max stats available at the target desc entry targets[0] in
@@ -631,12 +628,12 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
SchemaNode** schema_node_ptr = nullptr);
/// Create a ColumnStatsReader object for a column chunk described by a schema
- /// path in a slot descriptor 'slot_desc'. 'file_metadata', 'row_group', 'node',
- /// and 'col_type' provide extra data needed.
+ /// path in a slot descriptor 'slot_desc'. 'row_group', 'node', and 'col_type' provide
+ /// extra data needed.
/// On return:
/// A column chunk stats reader ('ColumnStatsReader') is returned.
- ColumnStatsReader CreateStatsReader(const parquet::FileMetaData& file_metadata,
- const parquet::RowGroup& row_group, SchemaNode* node, const ColumnType& col_type);
+ ColumnStatsReader CreateStatsReader(const parquet::RowGroup& row_group,
+ SchemaNode* node, const ColumnType& col_type);
/// Return the overlap predicate descs from the HDFS scan plan.
const vector<TOverlapPredicateDesc>& GetOverlapPredicateDescs();
@@ -944,6 +941,10 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
/// then we skip to row index 'skip_to_row'.
Status SkipRowsForColumns(const vector<ParquetColumnReader*>& column_readers,
int64_t* num_rows_to_skip, int64_t* skip_to_row);
+
+ /// Returns whether Hive legacy zone conversion should be used for transforming
+ /// timestamps based on file metadata and configuration.
+ bool GetHiveZoneConversionLegacy() const;
};
} // namespace impala
diff --git a/be/src/exec/parquet/parquet-common.cc b/be/src/exec/parquet/parquet-common.cc
index 34133ea4d..ee284e4b3 100644
--- a/be/src/exec/parquet/parquet-common.cc
+++ b/be/src/exec/parquet/parquet-common.cc
@@ -255,7 +255,8 @@ bool ParquetTimestampDecoder::GetTimestampInfoFromSchema(const parquet::SchemaEl
}
ParquetTimestampDecoder::ParquetTimestampDecoder(const parquet::SchemaElement& e,
- const Timezone* timezone, bool convert_int96_timestamps) {
+ const Timezone* timezone, bool convert_int96_timestamps,
+ bool hive_legacy_conversion) : hive_legacy_conversion_(hive_legacy_conversion) {
bool needs_conversion = false;
bool valid_schema = GetTimestampInfoFromSchema(e, precision_, needs_conversion);
DCHECK(valid_schema); // Invalid schemas should be rejected in an earlier step.
@@ -267,7 +268,15 @@ void ParquetTimestampDecoder::ConvertMinStatToLocalTime(TimestampValue* v) const
DCHECK(timezone_ != nullptr);
if (!v->HasDateAndTime()) return;
TimestampValue repeated_period_start;
- v->UtcToLocal(*timezone_, &repeated_period_start);
+ if (hive_legacy_conversion_) {
+ // Hive legacy conversion does not have efficient tools for identifying repeated
+ // periods, so subtract a day to ensure we cover all possible repeated periods
+ // (such as switching from UTC- to UTC+ near the international date line).
+ v->HiveLegacyUtcToLocal(*timezone_);
+ v->Subtract(boost::posix_time::hours(24));
+ } else {
+ v->UtcToLocal(*timezone_, &repeated_period_start);
+ }
if (repeated_period_start.HasDateAndTime()) *v = repeated_period_start;
}
@@ -275,7 +284,15 @@ void ParquetTimestampDecoder::ConvertMaxStatToLocalTime(TimestampValue* v) const
DCHECK(timezone_ != nullptr);
if (!v->HasDateAndTime()) return;
TimestampValue repeated_period_end;
- v->UtcToLocal(*timezone_, nullptr, &repeated_period_end);
+ if (hive_legacy_conversion_) {
+ // Hive legacy conversion does not have efficient tools for identifying repeated
+ // periods, so add a day to ensure we cover all possible repeated periods
+ // (such as switching from UTC- to UTC+ near the international date line).
+ v->HiveLegacyUtcToLocal(*timezone_);
+ v->Add(boost::posix_time::hours(24));
+ } else {
+ v->UtcToLocal(*timezone_, nullptr, &repeated_period_end);
+ }
if (repeated_period_end.HasDateAndTime()) *v = repeated_period_end;
}
}
diff --git a/be/src/exec/parquet/parquet-common.h b/be/src/exec/parquet/parquet-common.h
index 2b215e150..72ad18015 100644
--- a/be/src/exec/parquet/parquet-common.h
+++ b/be/src/exec/parquet/parquet-common.h
@@ -764,7 +764,7 @@ public:
ParquetTimestampDecoder() {}
ParquetTimestampDecoder( const parquet::SchemaElement& e, const Timezone* timezone,
- bool convert_int96_timestamps);
+ bool convert_int96_timestamps, bool hive_legacy_conversion);
bool NeedsConversion() const { return timezone_ != nullptr; }
@@ -798,7 +798,13 @@ public:
void ConvertToLocalTime(TimestampValue* v) const {
DCHECK(timezone_ != nullptr);
- if (v->HasDateAndTime()) v->UtcToLocal(*timezone_);
+ if (v->HasDateAndTime()) {
+ if (hive_legacy_conversion_) {
+ v->HiveLegacyUtcToLocal(*timezone_);
+ } else {
+ v->UtcToLocal(*timezone_);
+ }
+ }
}
/// Timezone conversion of min/max stats needs some extra logic because UTC->local
@@ -831,6 +837,9 @@ private:
/// INT64 decoding. INT64 with nanosecond precision (and reduced range) is also planned
/// to be implemented once it is added in Parquet (PARQUET-1387).
Precision precision_ = NANO;
+
+ /// Use Hive legacy-compatible conversion, performed in Java to match Hive's legacy
+ /// behavior.
+ bool hive_legacy_conversion_ = false;
};
template <>
diff --git a/be/src/runtime/raw-value-test.cc b/be/src/runtime/raw-value-test.cc
index 090178f1e..066adca69 100644
--- a/be/src/runtime/raw-value-test.cc
+++ b/be/src/runtime/raw-value-test.cc
@@ -19,6 +19,8 @@
#include "exprs/timezone_db.h"
#include "runtime/raw-value.inline.h"
+#include "runtime/timestamp-value.h"
+#include "runtime/timestamp-value.inline.h"
#include "testutil/gtest-util.h"
#include "common/names.h"
diff --git a/be/src/runtime/timestamp-value.cc b/be/src/runtime/timestamp-value.cc
index 417ef467d..9429704b8 100644
--- a/be/src/runtime/timestamp-value.cc
+++ b/be/src/runtime/timestamp-value.cc
@@ -17,14 +17,19 @@
#include "runtime/timestamp-value.h"
+#include <boost/algorithm/string/predicate.hpp>
+
#include "exprs/timestamp-functions.h"
#include "exprs/timezone_db.h"
#include "runtime/datetime-simple-date-format-parser.h"
+#include "runtime/exec-env.h"
#include "runtime/timestamp-parse-util.h"
#include "runtime/timestamp-value.inline.h"
+#include "service/frontend.h"
#include "common/names.h"
+using boost::algorithm::starts_with;
using boost::date_time::not_a_date_time;
using boost::gregorian::date;
using boost::gregorian::date_duration;
@@ -155,6 +160,32 @@ void TimestampValue::UtcToLocal(const Timezone& local_tz,
}
}
+void TimestampValue::HiveLegacyUtcToLocal(const Timezone& local_tz) {
+ DCHECK(HasDateAndTime());
+ int64_t utc_time_millis;
+ if (UNLIKELY(!FloorUtcToUnixTimeMillis(&utc_time_millis))) {
+ SetToInvalidDateTime();
+ return;
+ }
+
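+ // The timezone name may be an absolute zoneinfo path when the timezone database was
+ // loaded from disk; strip the prefix so Java receives a plain IANA name.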
+ string tz = local_tz.name();
+ static constexpr std::string_view zoneinfo = "/usr/share/zoneinfo/";
+ if (starts_with(tz, zoneinfo)) {
+ tz = tz.substr(zoneinfo.size());
+ }
+
+ TCivilTime cs;
+ Status status = ExecEnv::GetInstance()->frontend()->HiveLegacyTimezoneConvert(
+ tz, utc_time_millis, &cs);
+ if (UNLIKELY(!status.ok())) {
+ // Failing here could cause log spam, but the conversion should never fail in
+ // practice; on failure the value is left unconverted.
+ LOG(ERROR) << "Timezone " << tz << " cannot be used with legacy Hive conversion.";
+ return;
+ }
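+ // Rebuild the date and time from the returned civil time, keeping the original
+ // fractional seconds: the returned TCivilTime is only whole-second accurate.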
+ date_ = boost::gregorian::date(cs.year, cs.month, cs.day);
+ time_ = time_duration(cs.hour, cs.minute, cs.second, time_.fractional_seconds());
+}
+
void TimestampValue::LocalToUtc(const Timezone& local_tz,
TimestampValue* pre_utc_if_repeated, TimestampValue* post_utc_if_repeated) {
DCHECK(HasDateAndTime());
@@ -227,14 +258,6 @@ TimestampValue TimestampValue::UnixTimeToLocal(
}
}
-TimestampValue TimestampValue::FromUnixTime(time_t unix_time, const Timezone* local_tz) {
- if (local_tz != UTCPTR) {
- return UnixTimeToLocal(unix_time, *local_tz);
- } else {
- return UtcFromUnixTimeTicks<1>(unix_time);
- }
-}
-
void TimestampValue::ToString(string& dst) const {
dst.resize(SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN);
const int out_len = TimestampParser::FormatDefault(date(), time(), dst.data());
diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h
index f1fb126a8..02cfe7a62 100644
--- a/be/src/runtime/timestamp-value.h
+++ b/be/src/runtime/timestamp-value.h
@@ -296,6 +296,11 @@ class TimestampValue {
TimestampValue* start_of_repeated_period = nullptr,
TimestampValue* end_of_repeated_period = nullptr);
+ /// Converts from UTC to 'local_tz' time zone in-place. The caller must ensure the
+ /// TimestampValue this function is called upon has both a valid date and time. Uses
+ /// Java Calendar for conversion to match Hive's legacy conversion process.
+ void HiveLegacyUtcToLocal(const Timezone& local_tz);
+
/// Converts from 'local_tz' to UTC time zone in-place. The caller must ensure the
/// TimestampValue this function is called upon has both a valid date and time.
///
diff --git a/be/src/runtime/timestamp-value.inline.h b/be/src/runtime/timestamp-value.inline.h
index d49dbb2ae..6cfc63b6d 100644
--- a/be/src/runtime/timestamp-value.inline.h
+++ b/be/src/runtime/timestamp-value.inline.h
@@ -77,6 +77,15 @@ inline TimestampValue TimestampValue::UtcFromUnixTimeLimitedRangeNanos(
return UtcFromUnixTimeTicks(unix_time_nanos);
}
+inline TimestampValue TimestampValue::FromUnixTime(time_t unix_time,
+ const Timezone* local_tz) {
+ if (local_tz != UTCPTR) {
+ return UnixTimeToLocal(unix_time, *local_tz);
+ } else {
+ return UtcFromUnixTimeTicks<1>(unix_time);
+ }
+}
+
inline TimestampValue TimestampValue::FromUnixTimeNanos(time_t unix_time, int64_t nanos,
const Timezone* local_tz) {
unix_time =
diff --git a/be/src/service/frontend.cc b/be/src/service/frontend.cc
index aff23809f..22f3cdb77 100644
--- a/be/src/service/frontend.cc
+++ b/be/src/service/frontend.cc
@@ -157,7 +157,8 @@ Frontend::Frontend() {
};
JniMethodDescriptor staticMethods[] = {
- {"getSecretFromKeyStore", "([B)Ljava/lang/String;", &get_secret_from_key_store_}
+ {"getSecretFromKeyStore", "([B)Ljava/lang/String;", &get_secret_from_key_store_},
+ {"hiveLegacyTimezoneConvert", "([BJ)[B", &hive_legacy_timezone_convert_}
};
JNIEnv* jni_env = JniUtil::GetJNIEnv();
@@ -436,3 +437,12 @@ Status Frontend::GetSecretFromKeyStore(const string& secret_key, string* secret)
return JniUtil::CallStaticJniMethod(fe_class_, get_secret_from_key_store_, secret_key_t,
secret);
}
+
+Status Frontend::HiveLegacyTimezoneConvert(
+ const string& timezone, long utc_time_millis, TCivilTime* local_time) {
+ TStringLiteral timezone_t;
+ timezone_t.__set_value(timezone);
+ return JniCall::static_method(fe_class_, hive_legacy_timezone_convert_)
+ .with_thrift_arg(timezone_t).with_primitive_arg(utc_time_millis)
+ .Call(local_time);
+}
diff --git a/be/src/service/frontend.h b/be/src/service/frontend.h
index 823f98ca8..e2f0b551f 100644
--- a/be/src/service/frontend.h
+++ b/be/src/service/frontend.h
@@ -249,6 +249,10 @@ class Frontend {
/// Get secret from jceks key store for the input secret_key.
Status GetSecretFromKeyStore(const string& secret_key, string* secret);
+ /// Convert UTC UNIX time (in millis) to target timezone using Hive legacy conversion.
+ Status HiveLegacyTimezoneConvert(
+ const string& timezone, long utc_time_millis, TCivilTime* local_time);
+
private:
jclass fe_class_; // org.apache.impala.service.JniFrontend class
jobject fe_; // instance of org.apache.impala.service.JniFrontend
@@ -292,6 +296,7 @@ class Frontend {
jmethodID commit_kudu_txn_; // JniFrontend.commitKuduTransaction()
jmethodID convertTable; // JniFrontend.convertTable
jmethodID get_secret_from_key_store_; // JniFrontend.getSecretFromKeyStore()
+ jmethodID hive_legacy_timezone_convert_; // JniFrontend.hiveLegacyTimezoneConvert()
// Only used for testing.
jmethodID build_test_descriptor_table_id_; // JniFrontend.buildTestDescriptorTable()
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 44d571a80..5da1c6eb8 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -1330,6 +1330,10 @@ Status impala::SetQueryOption(TImpalaQueryOptions::type option, const string& va
query_options->__set_estimate_duplicate_in_preagg(IsTrue(value));
break;
}
+ case TImpalaQueryOptions::USE_LEGACY_HIVE_TIMESTAMP_CONVERSION: {
+ query_options->__set_use_legacy_hive_timestamp_conversion(IsTrue(value));
+ break;
+ }
default:
string key = to_string(option);
if (IsRemovedQueryOption(key)) {
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index b5587ec2a..6ce06aea7 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -51,7 +51,7 @@ typedef std::unordered_map
// plus one. Thus, the second argument to the DCHECK has to be updated every
// time we add or remove a query option to/from the enum TImpalaQueryOptions.
constexpr unsigned NUM_QUERY_OPTIONS =
- TImpalaQueryOptions::ESTIMATE_DUPLICATE_IN_PREAGG + 1;
+ TImpalaQueryOptions::USE_LEGACY_HIVE_TIMESTAMP_CONVERSION + 1;
#define QUERY_OPTS_TABLE \
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(), NUM_QUERY_OPTIONS); \
REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \
@@ -364,6 +364,8 @@ constexpr unsigned NUM_QUERY_OPTIONS =
ENABLE_TUPLE_ANALYSIS_IN_AGGREGATE, TQueryOptionLevel::ADVANCED) \
QUERY_OPT_FN(estimate_duplicate_in_preagg, \
ESTIMATE_DUPLICATE_IN_PREAGG, TQueryOptionLevel::ADVANCED) \
+ QUERY_OPT_FN(use_legacy_hive_timestamp_conversion, \
+ USE_LEGACY_HIVE_TIMESTAMP_CONVERSION, TQueryOptionLevel::ADVANCED) \
;
/// Enforce practical limits on some query options to avoid undesired query state.
diff --git a/common/thrift/Frontend.thrift b/common/thrift/Frontend.thrift
index bd70374e9..15007ae2a 100644
--- a/common/thrift/Frontend.thrift
+++ b/common/thrift/Frontend.thrift
@@ -1092,3 +1092,17 @@ struct TWrappedHttpResponse {
5: optional string content
6: optional string content_type
}
+
+// Captures civil time (local time in a specific time zone), mirroring
+// cctz::civil_second. Used to serialize Java timezone conversions back to C++ code.
+// Omits subsecond fields because:
+// - it matches cctz::civil_second, and no known timezone library applies subsecond
+//   adjustments
+// - Java timezone conversion is only accurate to milliseconds, while Impala uses
+//   nanoseconds
+struct TCivilTime {
+ 1: required i32 year
+ 2: required i32 month
+ 3: required i32 day
+ 4: required i32 hour
+ 5: required i32 minute
+ 6: required i32 second
+}
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index 90992a269..b28754848 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -977,6 +977,15 @@ enum TImpalaQueryOptions {
// If True, account for the probability of duplicate grouping keys existing in
// multiple nodes during preaggregation.
ESTIMATE_DUPLICATE_IN_PREAGG = 185
+
+ // When true and CONVERT_LEGACY_HIVE_PARQUET_UTC_TIMESTAMPS is also enabled, TIMESTAMP
+ // conversion to local time falls back to the conversion method of Hive 3.0 and
+ // earlier when the method is not recorded in the file. This matches the Hive option
+ // 'hive.parquet.timestamp.legacy.conversion.enabled', which defaults to true. Impala
+ // defaults to false because the legacy conversion is ~50x slower than Impala's
+ // default conversion method and they produce the same results for modern time
+ // periods (post 1970, and in most instances before that).
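+ // For example (as exercised by the tests added in this change):
+ //   SET CONVERT_LEGACY_HIVE_PARQUET_UTC_TIMESTAMPS=true;
+ //   SET USE_LEGACY_HIVE_TIMESTAMP_CONVERSION=true;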
+ USE_LEGACY_HIVE_TIMESTAMP_CONVERSION = 186
}
// The summary of a DML statement.
diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift
index 851f5a573..0e8458082 100644
--- a/common/thrift/Query.thrift
+++ b/common/thrift/Query.thrift
@@ -759,6 +759,9 @@ struct TQueryOptions {
// See comment in ImpalaService.thrift
186: optional bool estimate_duplicate_in_preagg = true
+
+ // See comment in ImpalaService.thrift
187: optional bool use_legacy_hive_timestamp_conversion = false
}
// Impala currently has three types of sessions: Beeswax, HiveServer2 and external
diff --git a/docs/topics/impala_timestamp.xml b/docs/topics/impala_timestamp.xml
index efa367e83..33e7d40ba 100644
--- a/docs/topics/impala_timestamp.xml
+++ b/docs/topics/impala_timestamp.xml
@@ -263,6 +263,43 @@ DATE_ADD (timestamp, INTERVAL interval
+
+ Hive versions prior to 3.1 wrote Parquet files in local time using Java's
+ SimpleDateFormat. That method differs in some cases from both Impala's
+ method and the default method used in Hive 3.1.2+, which are based on the
+ IANA Time Zone Database. Hive 4 added the
+ writer.zone.conversion.legacy Parquet file metadata property
+ to identify which method was used to write the file (controlled by
+ hive.parquet.timestamp.write.legacy.conversion.enabled). For
+ Parquet files written by Parquet Java (parquet-mr), Hive behaves as
+ follows, and so does Impala when
+ convert_legacy_hive_parquet_utc_timestamps is
+ true:
+
+ -
+ If writer.zone.conversion.legacy is present, use the legacy
+ conversion method if it is true and the newer method if it is false.
+
+ -
+ If writer.zone.conversion.legacy is not present but
+ writer.time.zone is, infer that the file was written by
+ Hive 3.1.2+ using the new APIs, and use the newer method.
+
+ -
+ Otherwise assume the file was written by an earlier Hive release. In that
+ case Hive selects the conversion method based on
+ hive.parquet.timestamp.legacy.conversion.enabled (defaults
+ to true). Impala adds the query
+ option use_legacy_hive_timestamp_conversion to select this
+ behavior. It defaults to false because the legacy conversion is
+ ~50x slower than Impala's default conversion method and they produce the
+ same results for modern time periods (post 1970, and in most instances
+ before that).
+
+
+
+
Hive currently cannot write INT64 TIMESTAMP values.
diff --git a/fe/src/main/java/org/apache/impala/service/JniFrontend.java b/fe/src/main/java/org/apache/impala/service/JniFrontend.java
index 3a3e4d77f..c737d2f17 100644
--- a/fe/src/main/java/org/apache/impala/service/JniFrontend.java
+++ b/fe/src/main/java/org/apache/impala/service/JniFrontend.java
@@ -54,6 +54,7 @@ import org.apache.impala.service.Frontend.PlanCtx;
import org.apache.impala.thrift.TBackendGflags;
import org.apache.impala.thrift.TBuildTestDescriptorTableParams;
import org.apache.impala.thrift.TCatalogObject;
+import org.apache.impala.thrift.TCivilTime;
import org.apache.impala.thrift.TDatabase;
import org.apache.impala.thrift.TDescribeDbParams;
import org.apache.impala.thrift.TDescribeResult;
@@ -115,11 +116,14 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.lang.IllegalArgumentException;
+import java.util.Calendar;
import java.util.Collections;
import java.util.Enumeration;
+import java.util.GregorianCalendar;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.TimeZone;
/**
* JNI-callable interface onto a wrapped Frontend instance. The main point is to serialise
@@ -808,6 +812,29 @@ public class JniFrontend {
return secret;
}
+ /**
+ * Performs Hive legacy-compatible timezone conversion via java.util.Calendar,
+ * returning the resulting civil time as a serialized TCivilTime.
+ */
+ public static byte[] hiveLegacyTimezoneConvert(byte[] timezoneT, long utc_time_millis)
+ throws ImpalaException {
+ final TStringLiteral timezone = new TStringLiteral();
+ JniUtil.deserializeThrift(protocolFactory_, timezone, timezoneT);
+ // TimeZone.getTimeZone defaults to GMT if it doesn't recognize the timezone.
+ Calendar c = new GregorianCalendar(TimeZone.getTimeZone(timezone.getValue()));
+ c.setTimeInMillis(utc_time_millis);
+
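+ // For example, with timezone "UTC" and utc_time_millis = 0, the fields below read
+ // year=1970, MONTH=0 (zero-based January), day=1, time 00:00:00.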
+ final TCivilTime civilTime = new TCivilTime(
+ // Normalize month so January starts at 1.
+ c.get(Calendar.YEAR), c.get(Calendar.MONTH)+1, c.get(Calendar.DAY_OF_MONTH),
+ c.get(Calendar.HOUR_OF_DAY), c.get(Calendar.MINUTE), c.get(Calendar.SECOND));
+ try {
+ TSerializer serializer = new TSerializer(protocolFactory_);
+ return serializer.serialize(civilTime);
+ } catch (TException e) {
+ throw new InternalException(e.getMessage());
+ }
+ }
+
public String validateSaml2Bearer(byte[] serializedRequest) throws ImpalaException{
Preconditions.checkNotNull(frontend_);
Preconditions.checkNotNull(frontend_.getSaml2Client());
diff --git a/testdata/data/employee_hive_3_1_3_us_pacific.parquet b/testdata/data/employee_hive_3_1_3_us_pacific.parquet
new file mode 100644
index 000000000..2125bc688
Binary files /dev/null and b/testdata/data/employee_hive_3_1_3_us_pacific.parquet differ
diff --git a/testdata/data/hive_kuala_lumpur_legacy.parquet b/testdata/data/hive_kuala_lumpur_legacy.parquet
new file mode 100644
index 000000000..f03efd4ff
Binary files /dev/null and b/testdata/data/hive_kuala_lumpur_legacy.parquet differ
diff --git a/testdata/data/tbl_parq1/000000_0 b/testdata/data/tbl_parq1/000000_0
new file mode 100644
index 000000000..17a910f52
Binary files /dev/null and b/testdata/data/tbl_parq1/000000_0 differ
diff --git a/testdata/data/tbl_parq1/000000_1 b/testdata/data/tbl_parq1/000000_1
new file mode 100644
index 000000000..cc0c27e6a
Binary files /dev/null and b/testdata/data/tbl_parq1/000000_1 differ
diff --git a/testdata/data/tbl_parq1/000000_2 b/testdata/data/tbl_parq1/000000_2
new file mode 100644
index 000000000..ba04470e2
Binary files /dev/null and b/testdata/data/tbl_parq1/000000_2 differ
diff --git a/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-313.test b/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-313.test
new file mode 100644
index 000000000..b92aa96fc
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-313.test
@@ -0,0 +1,46 @@
+====
+---- QUERY
+SET timezone=PST;
+SELECT * FROM employee_hive_3_1_3_us_pacific;
+---- TYPES
+INT,TIMESTAMP
+---- RESULTS
+1,1880-01-01 07:52:58
+2,1884-01-01 08:00:00
+3,1990-01-01 08:00:00
+====
+---- QUERY
+SET timezone=PST;
+SET convert_legacy_hive_parquet_utc_timestamps=true;
+SELECT * FROM employee_hive_3_1_3_us_pacific;
+---- TYPES
+INT,TIMESTAMP
+---- RESULTS
+1,1880-01-01 00:00:00
+2,1884-01-01 00:00:00
+3,1990-01-01 00:00:00
+====
+---- QUERY
+SET timezone=PST;
+SET convert_legacy_hive_parquet_utc_timestamps=true;
+SET use_legacy_hive_timestamp_conversion=true;
+SELECT * FROM employee_hive_3_1_3_us_pacific;
+---- TYPES
+INT,TIMESTAMP
+---- RESULTS
+1,1880-01-01 00:00:00
+2,1884-01-01 00:00:00
+3,1990-01-01 00:00:00
+====
+---- QUERY
+SET timezone=UTC;
+SET convert_legacy_hive_parquet_utc_timestamps=true;
+SET use_legacy_hive_timestamp_conversion=true;
+SELECT * FROM employee_hive_3_1_3_us_pacific;
+---- TYPES
+INT,TIMESTAMP
+---- RESULTS
+1,1880-01-01 07:52:58
+2,1884-01-01 08:00:00
+3,1990-01-01 08:00:00
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-3m.test b/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-3m.test
new file mode 100644
index 000000000..84e61bff3
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-3m.test
@@ -0,0 +1,34 @@
+====
+---- QUERY
+SET timezone="Asia/Singapore";
+SELECT * FROM t;
+---- RESULTS
+1899-12-31 16:00:00
+1899-12-31 16:00:00
+1899-12-31 17:04:35
+2020-04-08 05:17:05.215000000
+2020-04-08 05:17:05.215000000
+====
+---- QUERY
+SET timezone="Asia/Singapore";
+SET convert_legacy_hive_parquet_utc_timestamps=true;
+SELECT * FROM t;
+---- RESULTS
+1899-12-31 22:55:25
+1899-12-31 22:55:25
+1900-01-01 00:00:00
+2020-04-08 13:17:05.215000000
+2020-04-08 13:17:05.215000000
+====
+---- QUERY
+SET timezone="Asia/Singapore";
+SET convert_legacy_hive_parquet_utc_timestamps=true;
+SET use_legacy_hive_timestamp_conversion=true;
+SELECT * FROM t;
+---- RESULTS
+1899-12-31 22:55:25
+1900-01-01 00:00:00
+1900-01-01 00:00:00
+2020-04-08 13:17:05.215000000
+2020-04-08 13:17:05.215000000
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-4.test b/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-4.test
new file mode 100644
index 000000000..f97a805c3
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/timestamp-conversion-hive-4.test
@@ -0,0 +1,28 @@
+====
+---- QUERY
+SET timezone="Asia/Kuala_Lumpur";
+SELECT * FROM hive_kuala_lumpur_legacy;
+---- RESULTS
+1899-12-31 16:00:00
+1909-12-31 17:00:00
+1934-12-31 16:40:00
+1939-12-31 16:40:00
+1941-12-31 16:30:00
+1943-12-31 15:00:00
+1969-01-28 16:30:00
+1999-12-31 16:00:00
+====
+---- QUERY
+SET timezone="Asia/Kuala_Lumpur";
+SET convert_legacy_hive_parquet_utc_timestamps=true;
+SELECT * FROM hive_kuala_lumpur_legacy;
+---- RESULTS
+1900-01-01 00:00:00
+1910-01-01 00:00:00
+1935-01-01 00:00:00
+1940-01-01 00:00:00
+1942-01-01 00:00:00
+1944-01-01 00:00:00
+1969-01-29 00:00:00
+2000-01-01 00:00:00
+====
diff --git a/tests/query_test/test_hive_timestamp_conversion.py b/tests/query_test/test_hive_timestamp_conversion.py
new file mode 100644
index 000000000..2d594f144
--- /dev/null
+++ b/tests/query_test/test_hive_timestamp_conversion.py
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import, division, print_function
+
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.file_utils import create_table_and_copy_files, create_table_from_parquet
+
+
+class TestHiveParquetTimestampConversion(ImpalaTestSuite):
+ """Tests that Impala can read parquet files written by older versions of Hive or with
+ Hive legacy conversion enabled. Tests use convert_legacy_hive_parquet_utc_timestamps,
+ use_legacy_hive_timestamp_conversion, and timezone to test conversion."""
+
+ @classmethod
+ def get_workload(cls):
+ return 'functional-query'
+
+ @classmethod
+ def add_test_dimensions(cls):
+ super(TestHiveParquetTimestampConversion, cls).add_test_dimensions()
+ cls.ImpalaTestMatrix.add_constraint(lambda v:
+ v.get_value('table_format').file_format == 'parquet'
+ and v.get_value('table_format').compression_codec == 'none')
+
+ def test_hive_4_legacy(self, vector, unique_database):
+ """Test that legacy conversion uses the same timezone conversion as Hive when
+ Parquet metadata contains writer.zone.conversion.legacy=true.
+
+ Load test data generated via Hive with TZ=Asia/Kuala_Lumpur:
+
+ create table t (d timestamp) stored as parquet;
+ set hive.parquet.timestamp.write.legacy.conversion.enabled=true;
+ insert into t values ("1900-01-01 00:00:00"), ("1910-01-01 00:00:00"),
+ ("1935-01-01 00:00:00"), ("1940-01-01 00:00:00"), ("1942-01-01 00:00:00"),
+ ("1944-01-01 00:00:00"), ("1969-01-29 00:00:00"), ("2000-01-01 00:00:00");
+ """
+ create_table_from_parquet(self.client, unique_database, "hive_kuala_lumpur_legacy")
+ self.run_test_case("QueryTest/timestamp-conversion-hive-4", vector, unique_database)
+
+ def test_hive_313(self, vector, unique_database):
+ """The parquet file was written with Hive 3.1.3 using the new Date/Time APIs
+ (legacy=false) to convert from US/Pacific to UTC. The presence of writer.time.zone
+ in the file's metadata allows us to infer that the new Date/Time APIs should be
+ used for the conversion. The use_legacy_hive_timestamp_conversion option shouldn't
+ be taken into account in this case.
+
+ Test file from https://github.com/apache/hive/blob/rel/release-4.0.1/data/files/
+ employee_hive_3_1_3_us_pacific.parquet"""
+ create_table_from_parquet(
+ self.client, unique_database, "employee_hive_3_1_3_us_pacific")
+ self.run_test_case("QueryTest/timestamp-conversion-hive-313", vector, unique_database)
+
+ def test_hive_3_mixed(self, vector, unique_database):
+ """Test table containing Hive legacy timestamps written with Hive prior to 3.1.3.
+
+ Test files target timezone=Asia/Singapore, sourced from
+ https://github.com/apache/hive/tree/rel/release-4.0.1/data/files/tbl_parq1."""
+ create_stmt = "create table %s.t (d timestamp) stored as parquet" % unique_database
+ create_table_and_copy_files(self.client, create_stmt, unique_database, "t",
+ ["testdata/data/tbl_parq1/" + f for f in ["000000_0", "000000_1", "000000_2"]])
+ self.run_test_case("QueryTest/timestamp-conversion-hive-3m", vector, unique_database)