Files
impala/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
Csaba Ringhofer fb28011b57 IMPALA-9707: fix Parquet stat filtering when min/max values are cast to NULL
The min/max stat predicate is allowed even when the left side is not a
slot itself but an implicit cast of a slot. This could lead to incorrectly
dropping a row group or page when the min/max values were not castable to
the target type, e.g. when the column is a string holding a pre-1400 date
and it is implicitly cast to a timestamp.

The change should only affect timestamps, as dates return an error
on failed cast from a string, and numeric types won't be cast
implicitly from string.

The fix is simply to accept NULL result for the min/max predicate in
the backend. Note that the alternative solution of casting the right
(const) side of the predicate instead of the left side would be tricky,
as more than one string can mean the same timestamp, e.g.
"1970-01-01" and "1970-01-01 00:00:00".

Testing:
- added an EE regression test and ran it

Change-Id: I35f66e1dfc4523624c249073004f9d5eddd07bb6
Reviewed-on: http://gerrit.cloudera.org:8080/15959
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2020-05-21 18:40:20 +00:00

776 lines
22 KiB
Plaintext

====
---- QUERY
# Baseline case: no rows are expected, and the expected profile requires all 4 row
# groups to be counted as filtered by Parquet statistics.
select id, bool_col from functional_parquet.alltypessmall where int_col < 0
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Checks that the explain output (explain_level=2) lists the same predicate under
# 'parquet statistics predicates', with the literal cast to the column type.
set explain_level=2;
explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
---- RESULTS: VERIFY_IS_SUBSET
' parquet statistics predicates: int_col < CAST(0 AS INT)'
====
---- QUERY
# The next six cases apply the same "< 0" predicate to each numeric column in turn
# (tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col). Each expects
# a count of 0 and all 4 row groups to be stats-filtered.
select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where smallint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where int_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where bigint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where float_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where double_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test with inverted predicate
# (constant on the left, column on the right). The expected profile still requires all
# 4 row groups to be stats-filtered.
select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# The next seven cases use lower-bound predicates (">" and ">=") that, per the expected
# results, no row satisfies. Each expects all 4 row groups to be stats-filtered.
select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where smallint_col > 9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col > 9
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where bigint_col > 90
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where float_col > 9.9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where double_col > 99
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Same idea with an inclusive ">=" comparison.
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Inclusive comparison at the lower boundary value: rows match (12 expected), so the
# expected profile requires that no row group is stats-filtered.
select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
---- RESULTS
12
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Inclusive comparison at the upper boundary value: rows match (8 expected), so the
# expected profile requires that no row group is stats-filtered.
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Equality predicates on values that, per the expected results, do not occur in the
# column. Each expects all 4 row groups to be stats-filtered.
select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Checks that the equality predicate is listed under 'parquet statistics predicates' in
# the explain output, with the literal cast to TINYINT.
set explain_level=2;
explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
---- RESULTS: VERIFY_IS_SUBSET
' parquet statistics predicates: tinyint_col = CAST(10 AS TINYINT)'
====
---- QUERY
# Two-sided range on id: 51 rows match and the expected profile requires exactly 1 of
# the 4 row groups to be stats-filtered.
select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
---- RESULTS
51
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Mix with partitioning columns
# (here the predicate on year is combined with int_col < 0; all 4 row groups are still
# expected to be stats-filtered).
select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# The right-hand side is a constant expression (3 - 3) instead of a literal. With default
# query options all 4 row groups are still expected to be stats-filtered.
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test that without expr rewrite and thus without constant folding, constant exprs still
# can be used to prune row groups.
set enable_expr_rewrites=0;
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Constant expression (5 + 5) on the left-hand side of the comparison. With default
# query options all 4 row groups are expected to be stats-filtered.
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test that without expr rewrite and thus without constant folding, constant exprs still
# can be used to prune row groups.
set enable_expr_rewrites=0;
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test name based column resolution
# The copied table's columns are re-declared in a different order (int_col first, id
# last), so with parquet_fallback_schema_resolution=NAME the stats must be looked up by
# column name rather than by position.
create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
set parquet_fallback_schema_resolution=NAME;
# If this picks up the stats from int_col, then it will filter all row groups and return
# an incorrect result.
select count(*) from name_resolve where id > 10;
---- RESULTS
89
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Query that has an implicit cast to a larger integer type
# (the literal does not fit in a TINYINT). Stats filtering is still expected for all 4
# row groups.
select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
# The query returns 0 rows, but the expected profile shows no row group stats-filtered.
select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Explicit casts between numerical types can violate the transitivity of "min()", so they
# are not supported when evaluating parquet::Statistics.
# None of the 24 row groups is expected to be stats-filtered.
select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
---- RESULTS
3878
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Predicate on 'pos' of the nested collection int_array. The expected profile shows
# that neither of the 2 row groups is stats-filtered for this predicate.
select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
---- RESULTS
9
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Test the conversion of constant IN lists to min/max predicates
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 24
====
---- QUERY
# IN list whose values do occur in the table: 3 rows match and, per the expected
# profile, all but one of the 24 row groups are stats-filtered.
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 23
====
---- QUERY
# String column predicates. Judging by the expected results, string_col presumably holds
# the single-character digits "0".."9" (TODO confirm against the table data).
# "/" and ":" are the ASCII characters immediately before "0" and after "9".
select count(*) from functional_parquet.alltypessmall where string_col < "0"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col <= "/"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Rows match here, so no row group is expected to be stats-filtered.
select count(*) from functional_parquet.alltypessmall where string_col < "1"
---- RESULTS
12
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Rows match here, so no row group is expected to be stats-filtered.
select count(*) from functional_parquet.alltypessmall where string_col >= "9"
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col > ":"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Timestamp column compared against string literals. Per the expected results, the two
# literals used below sit at the lower and upper boundary of the column's values: the
# strict comparisons match no row and filter all 4 row groups, while the inclusive and
# equality comparisons match exactly 1 row and filter 3 row groups.
select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
# Date column predicates. The date_part IN list restricts every query in this group to
# two date_part values (2 row groups in the expected profiles). As with the timestamp
# cases, the strict comparisons match no row and filter both row groups, while the
# inclusive and equality comparisons match 1 row and filter 1 row group.
select count(*) from functional_parquet.date_tbl
where date_part in ("2017-11-27", "1399-06-27") and date_col < '0001-06-21';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 2
====
---- QUERY
select count(*) from functional_parquet.date_tbl
where date_part in ("2017-11-27", "1399-06-27") and date_col <= '0001-06-21';
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.date_tbl
where date_part in ("2017-11-27", "1399-06-27") and date_col = '0001-06-21';
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.date_tbl
where date_part in ("2017-11-27", "1399-06-27") and date_col > '2018-12-31';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 2
====
---- QUERY
select count(*) from functional_parquet.date_tbl
where date_part in ("2017-11-27", "1399-06-27") and date_col >= '2018-12-31';
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.date_tbl
where date_part in ("2017-11-27", "1399-06-27") and date_col = '2018-12-31';
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Decimal column predicates on decimal_tbl (a single row group in the expected
# profiles). The row group is expected to be stats-filtered exactly in the cases where
# the query matches no rows.
select count(*) from functional_parquet.decimal_tbl where d1 < 1234
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Equality just below the value matched by the next case (differs in the last digit).
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# The next two cases use literals with fewer fractional digits than the case above;
# 0.12345678 matches 5 rows while 0.12345679 matches none.
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
---- RESULTS
5
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Test that stats are disabled for CHAR type columns.
# This case only creates and populates the table; the predicate is tested in the next
# case.
create table chars (id int, c char(4)) stored as parquet;
insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
select count(*) from chars;
---- RESULTS
2
====
---- QUERY
# "aaaa" sorts before both inserted values, so no row matches, yet the expected profile
# requires that the row group is not stats-filtered.
select count(*) from chars where c <= "aaaa"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics
# query option.
# The query matches no rows, but with the option set to 0 none of the 24 row groups is
# expected to be stats-filtered.
set parquet_read_statistics=0;
select count(*) from functional_parquet.alltypes where id < 0;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Check that all the row groups are skipped using null_count stat
# Every inserted 'j' value is NULL, so the single row group is expected to be filtered.
create table table_for_null_count_test (i int, j int) stored as parquet;
insert into table_for_null_count_test values (1, NULL), (2, NULL), (3, NULL);
select count(*) from table_for_null_count_test where j < 3;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Insert another row group where not all the 'j' values are NULL
# Only the all-NULL row group from the previous case is expected to be filtered; the row
# with j = 1 (i = 4) must be returned.
insert into table_for_null_count_test values (4, 1), (5, NULL);
select i from table_for_null_count_test where j < 3;
---- RESULTS
4
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Turning off parquet stats and verifying that no row groups are skipped
# Uses a fresh table with the same all-NULL 'j' data as the first null_count case.
set PARQUET_READ_STATISTICS=0;
create table table_for_null_count_test2 (i int, j int) stored as parquet;
insert into table_for_null_count_test2 values (1, NULL), (2, NULL), (3, NULL);
select count(*) from table_for_null_count_test2 where j < 3;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# IMPALA-6527: NaN values lead to incorrect filtering.
# When the first value is NaN in a column chunk, Impala chose it as min_value and
# max_value for statistics. Test if it is no longer the case.
# The table holds NaN followed by 42; only 42 satisfies val > 0.
create table test_nan(val double) stored as parquet;
insert into test_nan values (cast('NaN' as double)), (42);
select * from test_nan where val > 0
---- RESULTS
42
====
---- QUERY
# IMPALA-6527: NaN values lead to incorrect filtering
# Test if '<' predicate produces expected results as well.
select * from test_nan where val < 100
---- RESULTS
42
====
---- QUERY
# IMPALA-6527: NaN values lead to incorrect filtering
# Test if valid statistics are written. The column chunk should be filtered out by
# the min filter.
# (No row satisfies val < 10, and the single row group must be stats-filtered.)
select * from test_nan where val < 10
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# IMPALA-6527: NaN values lead to incorrect filtering
# Test predicate that is true for NaN.
# Only the NaN row is expected; 42 does not satisfy 'not val >= 0'.
select * from test_nan where not val >= 0
---- RESULTS
NaN
====
---- QUERY
# IMPALA-6527: NaN values lead to incorrect filtering
# Test predicate that is true for NaN.
# Both rows (NaN and 42) are expected for '!='.
select * from test_nan where val != 0
---- RESULTS
NaN
42
====
---- QUERY
# Statistics filtering must not filter out row groups if predicate can be true for NaN.
# The table holds 10, 20 and then NaN (NaN is the last inserted value).
create table test_nan_true_predicate(val double) stored as parquet;
insert into test_nan_true_predicate values (10), (20), (cast('NaN' as double));
select * from test_nan_true_predicate where not val >= 0
---- RESULTS
NaN
====
---- QUERY
# NaN is the last element, predicate is true for NaN and value.
select * from test_nan_true_predicate where not val >= 20
---- RESULTS
10
NaN
====
---- QUERY
# NaN is the last element, predicate is true for NaN and value.
select * from test_nan_true_predicate where val != 10
---- RESULTS
20
NaN
====
---- QUERY
# Test the case when NaN is inserted between two values.
# Test predicate true for NaN and false for the values.
# The table holds 10, NaN, 20 in that insertion order.
create table test_nan_in_the_middle(val double) stored as parquet;
insert into test_nan_in_the_middle values (10), (cast('NaN' as double)), (20);
select * from test_nan_in_the_middle where not val >= 0
---- RESULTS
NaN
====
---- QUERY
# NaN in the middle, predicate true for NaN and value.
select * from test_nan_in_the_middle where not val >= 20
---- RESULTS
10
NaN
====
---- QUERY
# NaN in the middle, '!=' should return NaN and value.
select * from test_nan_in_the_middle where val != 10
---- RESULTS
NaN
20
====
---- QUERY
# Test the case when there are only NaNs in the column chunk.
# Test predicate true for NaN
# All three inserted NaN rows are expected back.
create table test_nan_only(val double) stored as parquet;
insert into test_nan_only values (cast('NaN' as double)), (cast('NaN' as double)),
(cast('NaN' as double));
select * from test_nan_only where not val >= 0
---- RESULTS
NaN
NaN
NaN
====
---- QUERY
# There are only NaN values, predicate is false for NaN
# so no rows are expected.
select * from test_nan_only where val >= 20
---- RESULTS
====
---- QUERY
# Test the case when a number is following multiple NaNs.
# Test predicate true for NaN, false for the inserted number
# The table holds three NaNs followed by 20.
create table test_multiple_nans(val double) stored as parquet;
insert into test_multiple_nans values (cast('NaN' as double)), (cast('NaN' as double)),
(cast('NaN' as double)), (20);
select * from test_multiple_nans where not val >= 0
---- RESULTS
NaN
NaN
NaN
====
---- QUERY
# Multiple NaNs followed by a number, predicate is false for NaN and true for the number
select * from test_multiple_nans where val >= 20
---- RESULTS
20
====
---- QUERY
# Multiple NaNs followed by a number, predicate is true for NaN and true for the number
select * from test_multiple_nans where not val > 20
---- RESULTS
NaN
NaN
NaN
20
====
---- QUERY
# Multiple NaNs followed by a number, predicate is false for NaN and false for the number
select * from test_multiple_nans where val > 20
---- RESULTS
====
---- QUERY
# Test the cases when column data types are changed from 32 bit integer to 8 bit integer,
# from 16 bit integer to 8 bit integer, and from 32 bit integer to 16 bit integer.
# Some existing data values in the table will overflow and cause the min/max stats
# values to be invalid.
# The first insert happens with the wide column types, the second one after the columns
# have been narrowed, so the table ends up with 4 rows.
set PARQUET_READ_STATISTICS=1;
create table tnarrow(i32_to_8 int, i16_to_8 smallint, i32_to_16 int) stored as parquet;
insert into tnarrow values (1, 2, 3), (200, 201, 40000);
alter table tnarrow change column i32_to_8 i32_to_8 tinyint;
alter table tnarrow change column i16_to_8 i16_to_8 tinyint;
alter table tnarrow change column i32_to_16 i32_to_16 smallint;
insert into tnarrow values (5, 6, 7), (-5, -6, -7);
select count(*) from tnarrow;
---- RESULTS
4
====
---- QUERY
# Test the case when column data type is changed from 32 bit integer to 8 bit integer.
# Expected negatives: -5 and the overflowed 200 (read back as -56).
select count(*) from tnarrow where i32_to_8 < 0;
---- RESULTS
2
====
---- QUERY
select count(*) from tnarrow where i32_to_8 > 0;
---- RESULTS
2
====
---- QUERY
select count(*) from tnarrow where i32_to_8 = 1;
---- RESULTS
1
====
---- QUERY
# When the column data type is changed, 200 overflows to the value -56.
select count(*) from tnarrow where i32_to_8 = -56;
---- RESULTS
1
====
---- QUERY
# Test the case when column data type is changed from 16 bit integer to 8 bit integer.
# Expected negatives: -6 and the overflowed 201 (read back as -55).
select count(*) from tnarrow where i16_to_8 < 0;
---- RESULTS
2
====
---- QUERY
select count(*) from tnarrow where i16_to_8 > 0;
---- RESULTS
2
====
---- QUERY
select count(*) from tnarrow where i16_to_8 = 2;
---- RESULTS
1
====
---- QUERY
# When the column data type is changed, 201 overflows to the value -55.
select count(*) from tnarrow where i16_to_8 = -55;
---- RESULTS
1
====
---- QUERY
# Test the case when column data type is changed from 32 bit integer to 16 bit integer.
# Expected negatives: -7 and the overflowed 40000 (read back as -25536).
select count(*) from tnarrow where i32_to_16 < 0;
---- RESULTS
2
====
---- QUERY
select count(*) from tnarrow where i32_to_16 > 0;
---- RESULTS
2
====
---- QUERY
select count(*) from tnarrow where i32_to_16 = 3;
---- RESULTS
1
====
---- QUERY
# When the column data type is changed, 40000 overflows to the value -25536.
select count(*) from tnarrow where i32_to_16 = -25536;
---- RESULTS
1
====
---- QUERY
# Regression test for IMPALA-9707.
# The min/max predicate contains a cast(as timestamp), which led to incorrectly dropping
# the whole row group if the min/max value was not a valid timestamp string.
# Here "non ts" is the value that cannot be cast to a timestamp; the row holding
# "2010-01-01" must still be counted.
create table ts (dt string) stored as parquet;
insert into ts values ("2010-01-01"), ("non ts");
select count(*) from ts where dt = cast("2010-01-01" as timestamp);
---- RESULTS
1
====