diff --git a/be/src/exec/parquet-column-stats.h b/be/src/exec/parquet-column-stats.h
index e9cf80193..44d4a6591 100644
--- a/be/src/exec/parquet-column-stats.h
+++ b/be/src/exec/parquet-column-stats.h
@@ -62,6 +62,23 @@ class ColumnStatsBase {
   /// the minimum or maximum value.
   enum class StatsField { MIN, MAX };
 
+  /// Min and max functions for types that are not floating point numbers. Delegates
+  /// to std::min/std::max, which compare with operator<.
+  template <typename T, typename Enable = void>
+  struct MinMaxTrait {
+    static decltype(auto) MinValue(const T& a, const T& b) { return std::min(a, b); }
+    static decltype(auto) MaxValue(const T& a, const T& b) { return std::max(a, b); }
+  };
+
+  /// Min and max functions for floating point types. std::fmin/std::fmax return the
+  /// non-NaN argument when exactly one argument is NaN, so a NaN does not replace
+  /// a valid min/max value; the result is NaN only if both arguments are NaN.
+  template <typename T>
+  struct MinMaxTrait<T, std::enable_if_t<std::is_floating_point<T>::value>> {
+    static decltype(auto) MinValue(const T& a, const T& b) { return std::fmin(a, b); }
+    static decltype(auto) MaxValue(const T& a, const T& b) { return std::fmax(a, b); }
+  };
+
   ColumnStatsBase() : has_min_max_values_(false), null_count_(0) {}
   virtual ~ColumnStatsBase() {}
 
diff --git a/be/src/exec/parquet-column-stats.inline.h b/be/src/exec/parquet-column-stats.inline.h
index 5b67ee7c1..0b618f972 100644
--- a/be/src/exec/parquet-column-stats.inline.h
+++ b/be/src/exec/parquet-column-stats.inline.h
@@ -37,8 +37,8 @@ inline void ColumnStats<T>::Update(const T& min_value, const T& max_value) {
     min_value_ = min_value;
     max_value_ = max_value;
   } else {
-    min_value_ = std::min(min_value_, min_value);
-    max_value_ = std::max(max_value_, max_value);
+    min_value_ = MinMaxTrait<T>::MinValue(min_value_, min_value);
+    max_value_ = MinMaxTrait<T>::MaxValue(max_value_, max_value);
   }
 }
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index 273dff8c0..73ad3e4fe 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -492,9 +492,9 @@ aggregation(SUM, NumRowGroups): 1
 aggregation(SUM, NumStatsFilteredRowGroups): 0
 ====
 ---- QUERY
-# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering.
-# When the first value is NaN in a column chunk, Impala chooses it as min_value and
-# max_value for statistics. In this case the min/max filter should be ignored.
+# IMPALA-6527: NaN values lead to incorrect filtering.
+# When the first value is NaN in a column chunk, Impala chose it as min_value and
+# max_value for statistics. Test if it is no longer the case.
 create table test_nan(val double) stored as parquet;
 insert into test_nan values (cast('NaN' as double)), (42);
 select * from test_nan where val > 0
@@ -502,29 +502,39 @@ select * from test_nan where val > 0
 42
 ====
 ---- QUERY
-# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering
-# test equality predicate
-select * from test_nan where val = 42
+# IMPALA-6527: NaN values lead to incorrect filtering
+# Test if '<' predicate produces expected results as well.
+select * from test_nan where val < 100
 ---- RESULTS
 42
 ====
 ---- QUERY
 # IMPALA-6527: NaN values lead to incorrect filtering
-# test predicate that is true for NaN
+# Test if valid statistics are written. The column chunk should be filtered out by
+# the min filter.
+select * from test_nan where val < 10
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# Test predicate that is true for NaN.
 select * from test_nan where not val >= 0
 ---- RESULTS
 NaN
 ====
 ---- QUERY
 # IMPALA-6527: NaN values lead to incorrect filtering
-# test predicate that is true for NaN
+# Test predicate that is true for NaN.
 select * from test_nan where val != 0
 ---- RESULTS
 NaN
 42
 ====
 ---- QUERY
-# Statistics filtering must not filter out row groups if predicate can be true for NaN
+# Statistics filtering must not filter out row groups if predicate can be true for NaN.
 create table test_nan_true_predicate(val double) stored as parquet;
 insert into test_nan_true_predicate values (10), (20), (cast('NaN' as double));
 select * from test_nan_true_predicate where not val >= 0
@@ -532,22 +542,22 @@ select * from test_nan_true_predicate where not val >= 0
 NaN
 ====
 ---- QUERY
-# NaN is the last element, predicate is true for NaN and value
+# NaN is the last element, predicate is true for NaN and value.
 select * from test_nan_true_predicate where not val >= 20
 ---- RESULTS
 10
 NaN
 ====
 ---- QUERY
-# NaN is the last element, predicate is true for NaN and value
+# NaN is the last element, predicate is true for NaN and value.
 select * from test_nan_true_predicate where val != 10
 ---- RESULTS
 20
 NaN
 ====
 ---- QUERY
-# Test the case when NaN is inserted between two values
-# Test predicate true for NaN and false for the values
+# Test the case when NaN is inserted between two values.
+# Test predicate true for NaN and false for the values.
 create table test_nan_in_the_middle(val double) stored as parquet;
 insert into test_nan_in_the_middle values (10), (cast('NaN' as double)), (20);
 select * from test_nan_in_the_middle where not val >= 0
@@ -555,16 +565,65 @@ select * from test_nan_in_the_middle where not val >= 0
 NaN
 ====
 ---- QUERY
-# NaN in the middle, predicate true for NaN and value
+# NaN in the middle, predicate true for NaN and value.
 select * from test_nan_in_the_middle where not val >= 20
 ---- RESULTS
 10
 NaN
 ====
 ---- QUERY
-# NaN in the middle, '!=' should return NaN and value
+# NaN in the middle, '!=' should return NaN and value.
 select * from test_nan_in_the_middle where val != 10
 ---- RESULTS
 NaN
 20
 ====
+---- QUERY
+# Test the case when there are only NaNs in the column chunk.
+# Test predicate true for NaN.
+create table test_nan_only(val double) stored as parquet;
+insert into test_nan_only values (cast('NaN' as double)), (cast('NaN' as double)),
+  (cast('NaN' as double));
+select * from test_nan_only where not val >= 0
+---- RESULTS
+NaN
+NaN
+NaN
+====
+---- QUERY
+# There are only NaN values, predicate is false for NaN.
+select * from test_nan_only where val >= 20
+---- RESULTS
+====
+---- QUERY
+# Test the case when a number is following multiple NaNs.
+# Test predicate true for NaN, false for the inserted number.
+create table test_multiple_nans(val double) stored as parquet;
+insert into test_multiple_nans values (cast('NaN' as double)), (cast('NaN' as double)),
+  (cast('NaN' as double)), (20);
+select * from test_multiple_nans where not val >= 0
+---- RESULTS
+NaN
+NaN
+NaN
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is false for NaN and true for the number.
+select * from test_multiple_nans where val >= 20
+---- RESULTS
+20
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is true for NaN and true for the number.
+select * from test_multiple_nans where not val > 20
+---- RESULTS
+NaN
+NaN
+NaN
+20
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is false for NaN and false for the number.
+select * from test_multiple_nans where val > 20
+---- RESULTS
+====