mirror of
https://github.com/apache/impala.git
synced 2026-02-01 21:00:29 -05:00
The min/max stat predicate is allowed when the left side is not a slot but an implicit cast of a slot. This could lead to incorrectly dropping a row group or page when min/max values were not castable to the type, e.g. it is string with a pre 1400 date and we want to cast it to a timestamp. The change should only affect timestamps, as dates return an error on failed cast from a string, and numeric types won't be cast implicitly from string. The fix is simply to accept NULL result for the min/max predicate in the backend. Note that the alternative solution of casting the right (const) side of the predicate instead of the left side would be tricky, as more than one string can mean the same timestamp, e.g. "1970-01-01" and "1970-01-01 00:00:00". Testing: - added an EE regression test and ran it Change-Id: I35f66e1dfc4523624c249073004f9d5eddd07bb6 Reviewed-on: http://gerrit.cloudera.org:8080/15959 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
776 lines
22 KiB
Plaintext
776 lines
22 KiB
Plaintext
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 0
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
set explain_level=2;
|
|
explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' parquet statistics predicates: int_col < CAST(0 AS INT)'
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where smallint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where int_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where bigint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where float_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where double_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test with inverted predicate
|
|
select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where smallint_col > 9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col > 9
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where bigint_col > 90
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where float_col > 9.9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where double_col > 99
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
|
|
---- RESULTS
|
|
12
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
|
|
---- RESULTS
|
|
8
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
set explain_level=2;
|
|
explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' parquet statistics predicates: tinyint_col = CAST(10 AS TINYINT)'
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
|
|
---- RESULTS
|
|
51
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Mix with partitioning columns
|
|
select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test that without expr rewrite and thus without constant folding, constant exprs still
|
|
# can be used to prune row groups.
|
|
set enable_expr_rewrites=0;
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test that without expr rewrite and thus without constant folding, constant exprs still
|
|
# can be used to prune row groups.
|
|
set enable_expr_rewrites=0;
|
|
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test name based column resolution
|
|
create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
|
|
alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
|
|
set parquet_fallback_schema_resolution=NAME;
|
|
# If this picks up the stats from int_col, then it will filter all row groups and return
|
|
# an incorrect result.
|
|
select count(*) from name_resolve where id > 10;
|
|
---- RESULTS
|
|
89
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Query that has an implicit cast to a larger integer type
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
|
|
select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Explicit casts between numerical types can violate the transitivity of "min()", so they
|
|
# are not supported when evaluating parquet::Statistics.
|
|
select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
|
|
---- RESULTS
|
|
3878
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
|
|
---- RESULTS
|
|
9
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Test the conversion of constant IN lists to min/max predicats
|
|
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 24
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
|
|
---- RESULTS
|
|
3
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 23
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col < "0"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col <= "/"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col < "1"
|
|
---- RESULTS
|
|
12
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col >= "9"
|
|
---- RESULTS
|
|
8
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col > ":"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.date_tbl
|
|
where date_part in ("2017-11-27", "1399-06-27") and date_col < '0001-06-21';
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 2
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.date_tbl
|
|
where date_part in ("2017-11-27", "1399-06-27") and date_col <= '0001-06-21';
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.date_tbl
|
|
where date_part in ("2017-11-27", "1399-06-27") and date_col = '0001-06-21';
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.date_tbl
|
|
where date_part in ("2017-11-27", "1399-06-27") and date_col > '2018-12-31';
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 2
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.date_tbl
|
|
where date_part in ("2017-11-27", "1399-06-27") and date_col >= '2018-12-31';
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.date_tbl
|
|
where date_part in ("2017-11-27", "1399-06-27") and date_col = '2018-12-31';
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d1 < 1234
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
|
|
---- RESULTS
|
|
5
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Test that stats are disabled for CHAR type columns.
|
|
create table chars (id int, c char(4)) stored as parquet;
|
|
insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
|
|
select count(*) from chars;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from chars where c <= "aaaa"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics
|
|
# query option.
|
|
set parquet_read_statistics=0;
|
|
select count(*) from functional_parquet.alltypes where id < 0;
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Check that all the row groups are skipped using null_count stat
|
|
create table table_for_null_count_test (i int, j int) stored as parquet;
|
|
insert into table_for_null_count_test values (1, NULL), (2, NULL), (3, NULL);
|
|
select count(*) from table_for_null_count_test where j < 3;
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Insert another row group where not all the 'j' values are NULL
|
|
insert into table_for_null_count_test values (4, 1), (5, NULL);
|
|
select i from table_for_null_count_test where j < 3;
|
|
---- RESULTS
|
|
4
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Turning off parquet stats and verifying that no row groups are skipped
|
|
set PARQUET_READ_STATISTICS=0;
|
|
create table table_for_null_count_test2 (i int, j int) stored as parquet;
|
|
insert into table_for_null_count_test2 values (1, NULL), (2, NULL), (3, NULL);
|
|
select count(*) from table_for_null_count_test2 where j < 3;
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# IMPALA-6527: NaN values lead to incorrect filtering.
|
|
# When the first value is NaN in a column chunk, Impala chose it as min_value and
|
|
# max_value for statistics. Test if it is no longer the case.
|
|
create table test_nan(val double) stored as parquet;
|
|
insert into test_nan values (cast('NaN' as double)), (42);
|
|
select * from test_nan where val > 0
|
|
---- RESULTS
|
|
42
|
|
====
|
|
---- QUERY
|
|
# IMPALA-6527: NaN values lead to incorrect filtering
|
|
# Test if '<' predicate produces expected results as well.
|
|
select * from test_nan where val < 100
|
|
---- RESULTS
|
|
42
|
|
====
|
|
---- QUERY
|
|
# IMPALA-6527: NaN values lead to incorrect filtering
|
|
# Test if valid statistics are written. The column chunk should be filtered out by
|
|
# the min filter.
|
|
select * from test_nan where val < 10
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# IMPALA-6527: NaN values lead to incorrect filtering
|
|
# Test predicate that is true for NaN.
|
|
select * from test_nan where not val >= 0
|
|
---- RESULTS
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# IMPALA-6527: NaN values lead to incorrect filtering
|
|
# Test predicate that is true for NaN.
|
|
select * from test_nan where val != 0
|
|
---- RESULTS
|
|
NaN
|
|
42
|
|
====
|
|
---- QUERY
|
|
# Statistics filtering must not filter out row groups if predicate can be true for NaN.
|
|
create table test_nan_true_predicate(val double) stored as parquet;
|
|
insert into test_nan_true_predicate values (10), (20), (cast('NaN' as double));
|
|
select * from test_nan_true_predicate where not val >= 0
|
|
---- RESULTS
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# NaN is the last element, predicate is true for NaN and value.
|
|
select * from test_nan_true_predicate where not val >= 20
|
|
---- RESULTS
|
|
10
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# NaN is the last element, predicate is true for NaN and value.
|
|
select * from test_nan_true_predicate where val != 10
|
|
---- RESULTS
|
|
20
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# Test the case when NaN is inserted between two values.
|
|
# Test predicate true for NaN and false for the values.
|
|
create table test_nan_in_the_middle(val double) stored as parquet;
|
|
insert into test_nan_in_the_middle values (10), (cast('NaN' as double)), (20);
|
|
select * from test_nan_in_the_middle where not val >= 0
|
|
---- RESULTS
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# NaN in the middle, predicate true for NaN and value.
|
|
select * from test_nan_in_the_middle where not val >= 20
|
|
---- RESULTS
|
|
10
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# NaN in the middle, '!=' should return NaN and value.
|
|
select * from test_nan_in_the_middle where val != 10
|
|
---- RESULTS
|
|
NaN
|
|
20
|
|
====
|
|
---- QUERY
|
|
# Test the case when there are only NaNs in the column chunk.
|
|
# Test predicate true for NaN
|
|
create table test_nan_only(val double) stored as parquet;
|
|
insert into test_nan_only values (cast('NaN' as double)), (cast('NaN' as double)),
|
|
(cast('NaN' as double));
|
|
select * from test_nan_only where not val >= 0
|
|
---- RESULTS
|
|
NaN
|
|
NaN
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# There are only NaN values, predicate is false for NaN
|
|
select * from test_nan_only where val >= 20
|
|
---- RESULTS
|
|
====
|
|
---- QUERY
|
|
# Test the case when a number is following multiple NaNs.
|
|
# Test predicate true for NaN, false for the inserted number
|
|
create table test_multiple_nans(val double) stored as parquet;
|
|
insert into test_multiple_nans values (cast('NaN' as double)), (cast('NaN' as double)),
|
|
(cast('NaN' as double)), (20);
|
|
select * from test_multiple_nans where not val >= 0
|
|
---- RESULTS
|
|
NaN
|
|
NaN
|
|
NaN
|
|
====
|
|
---- QUERY
|
|
# Multiple NaNs followed by a number, predicate is false for NaN and true for the number
|
|
select * from test_multiple_nans where val >= 20
|
|
---- RESULTS
|
|
20
|
|
====
|
|
---- QUERY
|
|
# Multiple NaNs followed by a number, predicate is true for NaN and true for the number
|
|
select * from test_multiple_nans where not val > 20
|
|
---- RESULTS
|
|
NaN
|
|
NaN
|
|
NaN
|
|
20
|
|
====
|
|
---- QUERY
|
|
# Multiple NaNs followed by a number, predicate is false for NaN and false for the number
|
|
select * from test_multiple_nans where val > 20
|
|
---- RESULTS
|
|
====
|
|
---- QUERY
|
|
# Test the cases when column data types are changed from 32 bit integer to 8 bit integer,
|
|
# from 16 bit integer to 8 bit integer, and from 32 bit integer to 16 bit integer.
|
|
# Some existing data values in the table will be overflow and cause the min/max stats
|
|
# values to be invalid.
|
|
set PARQUET_READ_STATISTICS=1;
|
|
create table tnarrow(i32_to_8 int, i16_to_8 smallint, i32_to_16 int) stored as parquet;
|
|
insert into tnarrow values (1, 2, 3), (200, 201, 40000);
|
|
alter table tnarrow change column i32_to_8 i32_to_8 tinyint;
|
|
alter table tnarrow change column i16_to_8 i16_to_8 tinyint;
|
|
alter table tnarrow change column i32_to_16 i32_to_16 smallint;
|
|
insert into tnarrow values (5, 6, 7), (-5, -6, -7);
|
|
select count(*) from tnarrow;
|
|
---- RESULTS
|
|
4
|
|
====
|
|
---- QUERY
|
|
# Test the case when column data type is changed from 32 bit integer to 8 bit integer.
|
|
select count(*) from tnarrow where i32_to_8 < 0;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from tnarrow where i32_to_8 > 0;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from tnarrow where i32_to_8 = 1;
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- QUERY
|
|
# When the column data type is changed, 200 is overflown as value -56.
|
|
select count(*) from tnarrow where i32_to_8 = -56;
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- QUERY
|
|
# Test the case when column data type is changed from 16 bit integer to 8 bit integer.
|
|
select count(*) from tnarrow where i16_to_8 < 0;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from tnarrow where i16_to_8 > 0;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from tnarrow where i16_to_8 = 2;
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- QUERY
|
|
# When the column data type is changed, 201 is overflown as value -55.
|
|
select count(*) from tnarrow where i16_to_8 = -55;
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- QUERY
|
|
# Test the case when column data type is changed from 32 bit integer to 16 bit integer.
|
|
select count(*) from tnarrow where i32_to_16 < 0;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from tnarrow where i32_to_16 > 0;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from tnarrow where i32_to_16 = 3;
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- QUERY
|
|
# When the column data type is changed, 40000 is overflown as value -25536.
|
|
select count(*) from tnarrow where i32_to_16 = -25536;
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- QUERY
|
|
# Regression test for IMPALA-9707.
|
|
# The min/max predicate contains a cast(as timestamp), which lead to incorrectly dropping
|
|
# the whole row group if the min/max value was not a valid timestamp string.
|
|
create table ts (dt string) stored as parquet;
|
|
insert into ts values ("2010-01-01"), ("non ts");
|
|
select count(*) from ts where dt = cast("2010-01-01" as timestamp);
|
|
---- RESULTS
|
|
1
|
|
====
|