IMPALA-6542: Fix inconsistent write path of Parquet min/max statistics

Quick fix of the Parquet write path until the Parquet community agrees on the ordering of floating point numbers. The behavior follows the way fmax()/fmin() works, i.e., Impala will only write NaN into the stats when all the values are NaNs. This behavior is aligned with the quick fix of Parquet-CPP. Added e2e tests as well. Change-Id: I3957806948f7c661af4be5495f2ec92d1e9fc9d6 Reviewed-on: http://gerrit.cloudera.org:8080/9381 Reviewed-by: Lars Volker <lv@cloudera.com> Tested-by: Impala Public Jenkins
2026-01-07 00:02:28 -05:00 · 2018-02-21 17:29:02 +01:00
parent 38cbff33e1
commit 5d044e0cb2
3 changed files with 90 additions and 17 deletions
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -492,9 +492,9 @@ aggregation(SUM, NumRowGroups): 1
 aggregation(SUM, NumStatsFilteredRowGroups): 0
 ====
 ---- QUERY
-# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering.
-# When the first value is NaN in a column chunk, Impala chooses it as min_value and
-# max_value for statistics. In this case the min/max filter should be ignored.
+# IMPALA-6527: NaN values lead to incorrect filtering.
+# When the first value is NaN in a column chunk, Impala used to choose it as min_value
+# and max_value for statistics. Test that this is no longer the case.
 create table test_nan(val double) stored as parquet;
 insert into test_nan values (cast('NaN' as double)), (42);
 select * from test_nan where val > 0
@@ -502,29 +502,39 @@ select * from test_nan where val > 0
 42
 ====
 ---- QUERY
-# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering
-# test equality predicate
-select * from test_nan where val = 42
+# IMPALA-6527: NaN values lead to incorrect filtering
+# Test if '<' predicate produces expected results as well.
+select * from test_nan where val < 100
 ---- RESULTS
 42
 ====
 ---- QUERY
 # IMPALA-6527: NaN values lead to incorrect filtering
-# test predicate that is true for NaN
+# Test if valid statistics are written. The column chunk should be filtered out by
+# the min filter.
+select * from test_nan where val < 10
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# Test predicate that is true for NaN.
 select * from test_nan where not val >= 0
 ---- RESULTS
 NaN
 ====
 ---- QUERY
 # IMPALA-6527: NaN values lead to incorrect filtering
-# test predicate that is true for NaN
+# Test predicate that is true for NaN.
 select * from test_nan where val != 0
 ---- RESULTS
 NaN
 42
 ====
 ---- QUERY
-# Statistics filtering must not filter out row groups if predicate can be true for NaN
+# Statistics filtering must not filter out row groups if predicate can be true for NaN.
 create table test_nan_true_predicate(val double) stored as parquet;
 insert into test_nan_true_predicate values (10), (20), (cast('NaN' as double));
 select * from test_nan_true_predicate where not val >= 0
@@ -532,22 +542,22 @@ select * from test_nan_true_predicate where not val >= 0
 NaN
 ====
 ---- QUERY
-# NaN is the last element, predicate is true for NaN and value
+# NaN is the last element, predicate is true for NaN and value.
 select * from test_nan_true_predicate where not val >= 20
 ---- RESULTS
 10
 NaN
 ====
 ---- QUERY
-# NaN is the last element, predicate is true for NaN and value
+# NaN is the last element, predicate is true for NaN and value.
 select * from test_nan_true_predicate where val != 10
 ---- RESULTS
 20
 NaN
 ====
 ---- QUERY
-# Test the case when NaN is inserted between two values
-# Test predicate true for NaN and false for the values
+# Test the case when NaN is inserted between two values.
+# Test predicate true for NaN and false for the values.
 create table test_nan_in_the_middle(val double) stored as parquet;
 insert into test_nan_in_the_middle values (10), (cast('NaN' as double)), (20);
 select * from test_nan_in_the_middle where not val >= 0
@@ -555,16 +565,65 @@ select * from test_nan_in_the_middle where not val >= 0
 NaN
 ====
 ---- QUERY
-# NaN in the middle, predicate true for NaN and value
+# NaN in the middle, predicate true for NaN and value.
 select * from test_nan_in_the_middle where not val >= 20
 ---- RESULTS
 10
 NaN
 ====
 ---- QUERY
-# NaN in the middle, '!=' should return NaN and value
+# NaN in the middle, '!=' should return NaN and value.
 select * from test_nan_in_the_middle where val != 10
 ---- RESULTS
 NaN
 20
 ====
+---- QUERY
+# Test the case when there are only NaNs in the column chunk.
+# Test predicate true for NaN.
+create table test_nan_only(val double) stored as parquet;
+insert into test_nan_only values (cast('NaN' as double)), (cast('NaN' as double)),
+    (cast('NaN' as double));
+select * from test_nan_only where not val >= 0
+---- RESULTS
+NaN
+NaN
+NaN
+====
+---- QUERY
+# There are only NaN values, predicate is false for NaN.
+select * from test_nan_only where val >= 20
+---- RESULTS
+====
+---- QUERY
+# Test the case when a number follows multiple NaNs.
+# Test predicate true for NaN, false for the inserted number.
+create table test_multiple_nans(val double) stored as parquet;
+insert into test_multiple_nans values (cast('NaN' as double)), (cast('NaN' as double)),
+    (cast('NaN' as double)), (20);
+select * from test_multiple_nans where not val >= 0
+---- RESULTS
+NaN
+NaN
+NaN
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is false for NaN and true for the number.
+select * from test_multiple_nans where val >= 20
+---- RESULTS
+20
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is true for NaN and true for the number.
+select * from test_multiple_nans where not val > 20
+---- RESULTS
+NaN
+NaN
+NaN
+20
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is false for NaN and false for the number.
+select * from test_multiple_nans where val > 20
+---- RESULTS
+====